//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is an ongoing development effort to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For
// this purpose, we temporarily introduced the VPlan-native vectorization path:
// an alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 202 // that predication is preferred, and this lists all options. I.e., the 203 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 204 // and predicate the instructions accordingly. 
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization "
                          "if tail-folding fails.")));
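
// Illustrative usage sketch (not exercised by this file): the enum values above
// map to the -prefer-predicate-over-epilogue flag, e.g. something like
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue ...
// which asks the vectorizer to try tail-folding first and only fall back to a
// scalar epilogue loop if tail-folding is not possible.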

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path.
// It must be used in conjunction with -enable-vplan-native-path.
// -vplan-verify-hcfg can also be used to enable the verification of the
// H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of a loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return TypeSize::get(VF.getKnownMinValue() *
                             DL.getTypeAllocSize(Ty).getFixedValue(),
                         VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
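
// Illustrative example for hasIrregularType() (assuming a typical DataLayout,
// not a guarantee): for Ty = i24 the allocated size is usually 4 bytes while
// each element of <4 x i24> occupies 3 bytes of store size, so an array of
// 4 x i24 takes 16 bytes but the vector stores only 12 -- the type is
// irregular and a widened access cannot simply reuse the scalar layout. For
// Ty = i32 both sizes agree and the type is regular.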

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}
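
// Usage sketch (illustrative only): the best-known trip count typically gates
// small-trip-count heuristics such as the thresholds declared above, e.g.
//   if (auto ExpectedTC = getSmallBestKnownTC(*SE, L))
//     if (*ExpectedTC < TinyTripCountVectorThreshold)
//       ... only vectorize if no scalar iteration overhead is incurred ...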

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and
  /// more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
                           Value *StartV, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead of
  /// \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start,
                             TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate
  /// a vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned.
  /// If, however, the value has a scalar entry in VectorLoopValueMap, we
  /// construct a new vector value on-demand by inserting the scalar values
  /// into a vector with an insertelement sequence. If the value has been
  /// neither vectorized nor scalarized, it must be loop invariant, so we
  /// simply broadcast the value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
    VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
  }

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;
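
  // Illustrative example for getStepVector() above (assuming a plain integer
  // induction; not taken from a test): with VF = 4, a broadcast Val of 10,
  // StartIdx = 0 and Step = 2, the result is <10, 12, 14, 16>, i.e. each lane
  // gets Val plus the next entry of the step sequence.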

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore there
  /// is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile-guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};
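
// Illustrative only (hypothetical factors, not a recommendation): a planner
// that picked VF=16/UF=2 for the main loop and VF=8 for the vectorized
// epilogue would describe that choice as
//   EpilogueLoopVectorizationInfo EPI(/*MVF=*/16, /*MUF=*/2, /*EVF=*/8,
//                                     /*EUF=*/1);
// and hand the same EPI object to both skeleton-creation passes below; note
// that the constructor asserts the epilogue unroll factor is 1.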

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                                 LoopInfo *LI, DominatorTree *DT,
                                 const TargetLibraryInfo *TLI,
                                 const TargetTransformInfo *TTI,
                                 AssumptionCache *AC,
                                 OptimizationRemarkEmitter *ORE,
                                 EpilogueLoopVectorizationInfo &EPI,
                                 LoopVectorizationLegality *LVL,
                                 llvm::LoopVectorizationCostModel *CM,
                                 BlockFrequencyInfo *BFI,
                                 ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back
    // to using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
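
// Worked example for createStepForVF() (illustrative, assuming an integer
// step): for Step = 1 and a fixed VF of <4 x i32> this returns the constant 4;
// for a scalable VF of <vscale x 4 x i32> it returns 4 * vscale, materialized
// via IRBuilder::CreateVScale.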

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
1301 SmallVector<RegisterUsage, 8> 1302 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1303 1304 /// Collect values we want to ignore in the cost model. 1305 void collectValuesToIgnore(); 1306 1307 /// Split reductions into those that happen in the loop, and those that happen 1308 /// outside. In loop reductions are collected into InLoopReductionChains. 1309 void collectInLoopReductions(); 1310 1311 /// \returns The smallest bitwidth each instruction can be represented with. 1312 /// The vector equivalents of these instructions should be truncated to this 1313 /// type. 1314 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1315 return MinBWs; 1316 } 1317 1318 /// \returns True if it is more profitable to scalarize instruction \p I for 1319 /// vectorization factor \p VF. 1320 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1321 assert(VF.isVector() && 1322 "Profitable to scalarize relevant only for VF > 1."); 1323 1324 // Cost model is not run in the VPlan-native path - return conservative 1325 // result until this changes. 1326 if (EnableVPlanNativePath) 1327 return false; 1328 1329 auto Scalars = InstsToScalarize.find(VF); 1330 assert(Scalars != InstsToScalarize.end() && 1331 "VF not yet analyzed for scalarization profitability"); 1332 return Scalars->second.find(I) != Scalars->second.end(); 1333 } 1334 1335 /// Returns true if \p I is known to be uniform after vectorization. 1336 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1337 if (VF.isScalar()) 1338 return true; 1339 1340 // Cost model is not run in the VPlan-native path - return conservative 1341 // result until this changes. 1342 if (EnableVPlanNativePath) 1343 return false; 1344 1345 auto UniformsPerVF = Uniforms.find(VF); 1346 assert(UniformsPerVF != Uniforms.end() && 1347 "VF not yet analyzed for uniformity"); 1348 return UniformsPerVF->second.count(I); 1349 } 1350 1351 /// Returns true if \p I is known to be scalar after vectorization. 1352 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1353 if (VF.isScalar()) 1354 return true; 1355 1356 // Cost model is not run in the VPlan-native path - return conservative 1357 // result until this changes. 1358 if (EnableVPlanNativePath) 1359 return false; 1360 1361 auto ScalarsPerVF = Scalars.find(VF); 1362 assert(ScalarsPerVF != Scalars.end() && 1363 "Scalar values are not calculated for VF"); 1364 return ScalarsPerVF->second.count(I); 1365 } 1366 1367 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1368 /// for vectorization factor \p VF. 1369 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1370 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1371 !isProfitableToScalarize(I, VF) && 1372 !isScalarAfterVectorization(I, VF); 1373 } 1374 1375 /// Decision that was taken during cost calculation for memory instruction. 1376 enum InstWidening { 1377 CM_Unknown, 1378 CM_Widen, // For consecutive accesses with stride +1. 1379 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1380 CM_Interleave, 1381 CM_GatherScatter, 1382 CM_Scalarize 1383 }; 1384 1385 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1386 /// instruction \p I and vector width \p VF. 
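  /// A hedged usage sketch (CM and Load are hypothetical names):
  ///   CM.setWideningDecision(Load, ElementCount::getFixed(4), CM_Widen,
  ///                          InstructionCost(2));
  /// records that Load is widened to a consecutive wide load at VF=4 for an
  /// estimated cost of 2 units.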
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
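    // Illustrative case (hypothetical IR): for a 64-bit induction %i with a
    // user
    //   %t = trunc i64 %i to i32
    // that is not free on the target, it is usually cheaper to create a
    // separate i32 induction variable than to widen the trunc, so we return
    // true iff the truncated value is an induction phi.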
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // This is a superset of the instructions that return true for
  // isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
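  /// For example (illustrative): a unit-stride access such as 'a[i]' can be
  /// emitted as one wide load or store per unroll part, whereas a non-unit
  /// stride such as 'a[3*i]' cannot and ends up as a gather/scatter or a
  /// scalarized access instead.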
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue() const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been ruled
  /// out by optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the loop tail.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize);

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, a power-of-2 larger
  /// than zero. One is returned if vectorization should best be avoided due
  /// to cost.
  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
                                    ElementCount UserVF);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(ElementCount VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
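  /// For example (illustrative): if DemandedBits proves an i32 value only
  /// carries 8 meaningful bits, it may be mapped to 8 here so the widened
  /// code can use <VF x i8> rather than <VF x i32>.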
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
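  /// For example (illustrative): the address computation feeding a
  /// consecutive load of 'a[i]' is uniform, since one scalar GEP per unroll
  /// part is enough to feed the wide load, even though the loaded values
  /// themselves are not uniform.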
1758 void collectLoopUniforms(ElementCount VF); 1759 1760 /// Collect the instructions that are scalar after vectorization. An 1761 /// instruction is scalar if it is known to be uniform or will be scalarized 1762 /// during vectorization. Non-uniform scalarized instructions will be 1763 /// represented by VF values in the vectorized loop, each corresponding to an 1764 /// iteration of the original scalar loop. 1765 void collectLoopScalars(ElementCount VF); 1766 1767 /// Keeps cost model vectorization decision and cost for instructions. 1768 /// Right now it is used for memory instructions only. 1769 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1770 std::pair<InstWidening, InstructionCost>>; 1771 1772 DecisionList WideningDecisions; 1773 1774 /// Returns true if \p V is expected to be vectorized and it needs to be 1775 /// extracted. 1776 bool needsExtract(Value *V, ElementCount VF) const { 1777 Instruction *I = dyn_cast<Instruction>(V); 1778 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1779 TheLoop->isLoopInvariant(I)) 1780 return false; 1781 1782 // Assume we can vectorize V (and hence we need extraction) if the 1783 // scalars are not computed yet. This can happen, because it is called 1784 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1785 // the scalars are collected. That should be a safe assumption in most 1786 // cases, because we check if the operands have vectorizable types 1787 // beforehand in LoopVectorizationLegality. 1788 return Scalars.find(VF) == Scalars.end() || 1789 !isScalarAfterVectorization(I, VF); 1790 }; 1791 1792 /// Returns a range containing only operands needing to be extracted. 1793 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1794 ElementCount VF) { 1795 return SmallVector<Value *, 4>(make_filter_range( 1796 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1797 } 1798 1799 /// Determines if we have the infrastructure to vectorize loop \p L and its 1800 /// epilogue, assuming the main loop is vectorized by \p VF. 1801 bool isCandidateForEpilogueVectorization(const Loop &L, 1802 const ElementCount VF) const; 1803 1804 /// Returns true if epilogue vectorization is considered profitable, and 1805 /// false otherwise. 1806 /// \p VF is the vectorization factor chosen for the original loop. 1807 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1808 1809 public: 1810 /// The loop that we evaluate. 1811 Loop *TheLoop; 1812 1813 /// Predicated scalar evolution analysis. 1814 PredicatedScalarEvolution &PSE; 1815 1816 /// Loop Info analysis. 1817 LoopInfo *LI; 1818 1819 /// Vectorization legality. 1820 LoopVectorizationLegality *Legal; 1821 1822 /// Vector target information. 1823 const TargetTransformInfo &TTI; 1824 1825 /// Target Library Info. 1826 const TargetLibraryInfo *TLI; 1827 1828 /// Demanded bits analysis. 1829 DemandedBits *DB; 1830 1831 /// Assumption cache. 1832 AssumptionCache *AC; 1833 1834 /// Interface to emit optimization remarks. 1835 OptimizationRemarkEmitter *ORE; 1836 1837 const Function *TheFunction; 1838 1839 /// Loop Vectorize Hint. 1840 const LoopVectorizeHints *Hints; 1841 1842 /// The interleave access information contains groups of interleaved accesses 1843 /// with the same stride and close to each other. 1844 InterleavedAccessInfo &InterleaveInfo; 1845 1846 /// Values to ignore in the cost model. 1847 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1848 1849 /// Values to ignore in the cost model when VF > 1. 
1850 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1851 1852 /// Profitable vector factors. 1853 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1854 }; 1855 1856 } // end namespace llvm 1857 1858 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1859 // vectorization. The loop needs to be annotated with #pragma omp simd 1860 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1861 // vector length information is not provided, vectorization is not considered 1862 // explicit. Interleave hints are not allowed either. These limitations will be 1863 // relaxed in the future. 1864 // Please, note that we are currently forced to abuse the pragma 'clang 1865 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1866 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1867 // provides *explicit vectorization hints* (LV can bypass legal checks and 1868 // assume that vectorization is legal). However, both hints are implemented 1869 // using the same metadata (llvm.loop.vectorize, processed by 1870 // LoopVectorizeHints). This will be fixed in the future when the native IR 1871 // representation for pragma 'omp simd' is introduced. 1872 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1873 OptimizationRemarkEmitter *ORE) { 1874 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 1875 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1876 1877 // Only outer loops with an explicit vectorization hint are supported. 1878 // Unannotated outer loops are ignored. 1879 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1880 return false; 1881 1882 Function *Fn = OuterLp->getHeader()->getParent(); 1883 if (!Hints.allowVectorization(Fn, OuterLp, 1884 true /*VectorizeOnlyWhenForced*/)) { 1885 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1886 return false; 1887 } 1888 1889 if (Hints.getInterleave() > 1) { 1890 // TODO: Interleave support is future work. 1891 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1892 "outer loops.\n"); 1893 Hints.emitRemarkWithHints(); 1894 return false; 1895 } 1896 1897 return true; 1898 } 1899 1900 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1901 OptimizationRemarkEmitter *ORE, 1902 SmallVectorImpl<Loop *> &V) { 1903 // Collect inner loops and outer loops without irreducible control flow. For 1904 // now, only collect outer loops that have explicit vectorization hints. If we 1905 // are stress testing the VPlan H-CFG construction, we collect the outermost 1906 // loop of every loop nest. 1907 if (L.isInnermost() || VPlanBuildStressTest || 1908 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1909 LoopBlocksRPO RPOT(&L); 1910 RPOT.perform(LI); 1911 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1912 V.push_back(&L); 1913 // TODO: Collect inner loops inside marked outer loops in case 1914 // vectorization fails for the outer loop. Do not invoke 1915 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1916 // already known to be reducible. We can use an inherited attribute for 1917 // that. 1918 return; 1919 } 1920 } 1921 for (Loop *InnerL : L) 1922 collectSupportedLoops(*InnerL, LI, ORE, V); 1923 } 1924 1925 namespace { 1926 1927 /// The LoopVectorize Pass. 
1928 struct LoopVectorize : public FunctionPass { 1929 /// Pass identification, replacement for typeid 1930 static char ID; 1931 1932 LoopVectorizePass Impl; 1933 1934 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1935 bool VectorizeOnlyWhenForced = false) 1936 : FunctionPass(ID), 1937 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1938 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1939 } 1940 1941 bool runOnFunction(Function &F) override { 1942 if (skipFunction(F)) 1943 return false; 1944 1945 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1946 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1947 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1948 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1949 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1950 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1951 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1952 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1953 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1954 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1955 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1956 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1957 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1958 1959 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1960 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1961 1962 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1963 GetLAA, *ORE, PSI).MadeAnyChange; 1964 } 1965 1966 void getAnalysisUsage(AnalysisUsage &AU) const override { 1967 AU.addRequired<AssumptionCacheTracker>(); 1968 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1969 AU.addRequired<DominatorTreeWrapperPass>(); 1970 AU.addRequired<LoopInfoWrapperPass>(); 1971 AU.addRequired<ScalarEvolutionWrapperPass>(); 1972 AU.addRequired<TargetTransformInfoWrapperPass>(); 1973 AU.addRequired<AAResultsWrapperPass>(); 1974 AU.addRequired<LoopAccessLegacyAnalysis>(); 1975 AU.addRequired<DemandedBitsWrapperPass>(); 1976 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1977 AU.addRequired<InjectTLIMappingsLegacy>(); 1978 1979 // We currently do not preserve loopinfo/dominator analyses with outer loop 1980 // vectorization. Until this is addressed, mark these analyses as preserved 1981 // only for non-VPlan-native path. 1982 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1983 if (!EnableVPlanNativePath) { 1984 AU.addPreserved<LoopInfoWrapperPass>(); 1985 AU.addPreserved<DominatorTreeWrapperPass>(); 1986 } 1987 1988 AU.addPreserved<BasicAAWrapperPass>(); 1989 AU.addPreserved<GlobalsAAWrapperPass>(); 1990 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1991 } 1992 }; 1993 1994 } // end anonymous namespace 1995 1996 //===----------------------------------------------------------------------===// 1997 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1998 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1999 //===----------------------------------------------------------------------===// 2000 2001 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2002 // We need to place the broadcast of invariant variables outside the loop, 2003 // but only if it's proven safe to do so. Else, broadcast will be inside 2004 // vector loop body. 
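  // Illustrative result (hypothetical IR): for a scalar i32 %x and VF=4,
  // CreateVectorSplat below typically expands to something like
  //   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> poison, <4 x i32> zeroinitializer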
2005 Instruction *Instr = dyn_cast<Instruction>(V); 2006 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2007 (!Instr || 2008 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2009 // Place the code for broadcasting invariant variables in the new preheader. 2010 IRBuilder<>::InsertPointGuard Guard(Builder); 2011 if (SafeToHoist) 2012 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2013 2014 // Broadcast the scalar into all locations in the vector. 2015 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2016 2017 return Shuf; 2018 } 2019 2020 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2021 const InductionDescriptor &II, Value *Step, Value *Start, 2022 Instruction *EntryVal) { 2023 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2024 "Expected either an induction phi-node or a truncate of it!"); 2025 2026 // Construct the initial value of the vector IV in the vector loop preheader 2027 auto CurrIP = Builder.saveIP(); 2028 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2029 if (isa<TruncInst>(EntryVal)) { 2030 assert(Start->getType()->isIntegerTy() && 2031 "Truncation requires an integer type"); 2032 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2033 Step = Builder.CreateTrunc(Step, TruncType); 2034 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2035 } 2036 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2037 Value *SteppedStart = 2038 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2039 2040 // We create vector phi nodes for both integer and floating-point induction 2041 // variables. Here, we determine the kind of arithmetic we will perform. 2042 Instruction::BinaryOps AddOp; 2043 Instruction::BinaryOps MulOp; 2044 if (Step->getType()->isIntegerTy()) { 2045 AddOp = Instruction::Add; 2046 MulOp = Instruction::Mul; 2047 } else { 2048 AddOp = II.getInductionOpcode(); 2049 MulOp = Instruction::FMul; 2050 } 2051 2052 // Multiply the vectorization factor by the step using integer or 2053 // floating-point arithmetic as appropriate. 2054 Value *ConstVF = 2055 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2056 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 2057 2058 // Create a vector splat to use in the induction update. 2059 // 2060 // FIXME: If the step is non-constant, we create the vector splat with 2061 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2062 // handle a constant vector splat. 2063 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2064 Value *SplatVF = isa<Constant>(Mul) 2065 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2066 : Builder.CreateVectorSplat(VF, Mul); 2067 Builder.restoreIP(CurrIP); 2068 2069 // We may need to add the step a number of times, depending on the unroll 2070 // factor. The last of those goes into the PHI. 
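  // Illustrative shape (hypothetical IR) for an i32 IV with step 1, VF=4 and
  // UF=2:
  //   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %preheader ],
  //                                 [ %vec.ind.next, %latch ]
  //   %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  //   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>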
2071 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2072 &*LoopVectorBody->getFirstInsertionPt()); 2073 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2074 Instruction *LastInduction = VecInd; 2075 for (unsigned Part = 0; Part < UF; ++Part) { 2076 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 2077 2078 if (isa<TruncInst>(EntryVal)) 2079 addMetadata(LastInduction, EntryVal); 2080 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 2081 2082 LastInduction = cast<Instruction>(addFastMathFlag( 2083 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 2084 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2085 } 2086 2087 // Move the last step to the end of the latch block. This ensures consistent 2088 // placement of all induction updates. 2089 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2090 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2091 auto *ICmp = cast<Instruction>(Br->getCondition()); 2092 LastInduction->moveBefore(ICmp); 2093 LastInduction->setName("vec.ind.next"); 2094 2095 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2096 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2097 } 2098 2099 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2100 return Cost->isScalarAfterVectorization(I, VF) || 2101 Cost->isProfitableToScalarize(I, VF); 2102 } 2103 2104 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2105 if (shouldScalarizeInstruction(IV)) 2106 return true; 2107 auto isScalarInst = [&](User *U) -> bool { 2108 auto *I = cast<Instruction>(U); 2109 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2110 }; 2111 return llvm::any_of(IV->users(), isScalarInst); 2112 } 2113 2114 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2115 const InductionDescriptor &ID, const Instruction *EntryVal, 2116 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 2117 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2118 "Expected either an induction phi-node or a truncate of it!"); 2119 2120 // This induction variable is not the phi from the original loop but the 2121 // newly-created IV based on the proof that casted Phi is equal to the 2122 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2123 // re-uses the same InductionDescriptor that original IV uses but we don't 2124 // have to do any recording in this case - that is done when original IV is 2125 // processed. 2126 if (isa<TruncInst>(EntryVal)) 2127 return; 2128 2129 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2130 if (Casts.empty()) 2131 return; 2132 // Only the first Cast instruction in the Casts vector is of interest. 2133 // The rest of the Casts (if exist) have no uses outside the 2134 // induction update chain itself. 
2135 Instruction *CastInst = *Casts.begin(); 2136 if (Lane < UINT_MAX) 2137 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 2138 else 2139 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 2140 } 2141 2142 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2143 TruncInst *Trunc) { 2144 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2145 "Primary induction variable must have an integer type"); 2146 2147 auto II = Legal->getInductionVars().find(IV); 2148 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2149 2150 auto ID = II->second; 2151 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2152 2153 // The value from the original loop to which we are mapping the new induction 2154 // variable. 2155 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2156 2157 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2158 2159 // Generate code for the induction step. Note that induction steps are 2160 // required to be loop-invariant 2161 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2162 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2163 "Induction step should be loop invariant"); 2164 if (PSE.getSE()->isSCEVable(IV->getType())) { 2165 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2166 return Exp.expandCodeFor(Step, Step->getType(), 2167 LoopVectorPreHeader->getTerminator()); 2168 } 2169 return cast<SCEVUnknown>(Step)->getValue(); 2170 }; 2171 2172 // The scalar value to broadcast. This is derived from the canonical 2173 // induction variable. If a truncation type is given, truncate the canonical 2174 // induction variable and step. Otherwise, derive these values from the 2175 // induction descriptor. 2176 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2177 Value *ScalarIV = Induction; 2178 if (IV != OldInduction) { 2179 ScalarIV = IV->getType()->isIntegerTy() 2180 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2181 : Builder.CreateCast(Instruction::SIToFP, Induction, 2182 IV->getType()); 2183 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2184 ScalarIV->setName("offset.idx"); 2185 } 2186 if (Trunc) { 2187 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2188 assert(Step->getType()->isIntegerTy() && 2189 "Truncation requires an integer step"); 2190 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2191 Step = Builder.CreateTrunc(Step, TruncType); 2192 } 2193 return ScalarIV; 2194 }; 2195 2196 // Create the vector values from the scalar IV, in the absence of creating a 2197 // vector IV. 2198 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2199 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2200 for (unsigned Part = 0; Part < UF; ++Part) { 2201 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2202 Value *EntryPart = 2203 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2204 ID.getInductionOpcode()); 2205 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 2206 if (Trunc) 2207 addMetadata(EntryPart, Trunc); 2208 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 2209 } 2210 }; 2211 2212 // Now do the actual transformations, and start with creating the step value. 
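  // In outline (summarizing the cases below): a scalar VF just splats the
  // scalar IV; otherwise a vector IV phi is created unless the IV itself is
  // scalarized, scalar per-lane steps are added whenever scalarized users
  // need them, and under tail-folding a splat of the scalar IV is still
  // emitted so the mask computation has a vector value to work with.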
2213 Value *Step = CreateStepValue(ID.getStep()); 2214 if (VF.isZero() || VF.isScalar()) { 2215 Value *ScalarIV = CreateScalarIV(Step); 2216 CreateSplatIV(ScalarIV, Step); 2217 return; 2218 } 2219 2220 // Determine if we want a scalar version of the induction variable. This is 2221 // true if the induction variable itself is not widened, or if it has at 2222 // least one user in the loop that is not widened. 2223 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2224 if (!NeedsScalarIV) { 2225 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); 2226 return; 2227 } 2228 2229 // Try to create a new independent vector induction variable. If we can't 2230 // create the phi node, we will splat the scalar induction variable in each 2231 // loop iteration. 2232 if (!shouldScalarizeInstruction(EntryVal)) { 2233 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); 2234 Value *ScalarIV = CreateScalarIV(Step); 2235 // Create scalar steps that can be used by instructions we will later 2236 // scalarize. Note that the addition of the scalar steps will not increase 2237 // the number of instructions in the loop in the common case prior to 2238 // InstCombine. We will be trading one vector extract for each scalar step. 2239 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2240 return; 2241 } 2242 2243 // All IV users are scalar instructions, so only emit a scalar IV, not a 2244 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2245 // predicate used by the masked loads/stores. 2246 Value *ScalarIV = CreateScalarIV(Step); 2247 if (!Cost->isScalarEpilogueAllowed()) 2248 CreateSplatIV(ScalarIV, Step); 2249 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2250 } 2251 2252 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2253 Instruction::BinaryOps BinOp) { 2254 // Create and check the types. 2255 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2256 int VLen = ValVTy->getNumElements(); 2257 2258 Type *STy = Val->getType()->getScalarType(); 2259 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2260 "Induction Step must be an integer or FP"); 2261 assert(Step->getType() == STy && "Step has wrong type"); 2262 2263 SmallVector<Constant *, 8> Indices; 2264 2265 if (STy->isIntegerTy()) { 2266 // Create a vector of consecutive numbers from zero to VF. 2267 for (int i = 0; i < VLen; ++i) 2268 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2269 2270 // Add the consecutive indices to the vector value. 2271 Constant *Cv = ConstantVector::get(Indices); 2272 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2273 Step = Builder.CreateVectorSplat(VLen, Step); 2274 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2275 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2276 // which can be found from the original scalar operations. 2277 Step = Builder.CreateMul(Cv, Step); 2278 return Builder.CreateAdd(Val, Step, "induction"); 2279 } 2280 2281 // Floating point induction. 2282 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2283 "Binary Opcode should be specified for FP induction"); 2284 // Create a vector of consecutive numbers from zero to VF. 2285 for (int i = 0; i < VLen; ++i) 2286 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2287 2288 // Add the consecutive indices to the vector value. 
2289 Constant *Cv = ConstantVector::get(Indices); 2290 2291 Step = Builder.CreateVectorSplat(VLen, Step); 2292 2293 // Floating point operations had to be 'fast' to enable the induction. 2294 FastMathFlags Flags; 2295 Flags.setFast(); 2296 2297 Value *MulOp = Builder.CreateFMul(Cv, Step); 2298 if (isa<Instruction>(MulOp)) 2299 // Have to check, MulOp may be a constant 2300 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2301 2302 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2303 if (isa<Instruction>(BOp)) 2304 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2305 return BOp; 2306 } 2307 2308 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2309 Instruction *EntryVal, 2310 const InductionDescriptor &ID) { 2311 // We shouldn't have to build scalar steps if we aren't vectorizing. 2312 assert(VF.isVector() && "VF should be greater than one"); 2313 // Get the value type and ensure it and the step have the same integer type. 2314 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2315 assert(ScalarIVTy == Step->getType() && 2316 "Val and Step should have the same type"); 2317 2318 // We build scalar steps for both integer and floating-point induction 2319 // variables. Here, we determine the kind of arithmetic we will perform. 2320 Instruction::BinaryOps AddOp; 2321 Instruction::BinaryOps MulOp; 2322 if (ScalarIVTy->isIntegerTy()) { 2323 AddOp = Instruction::Add; 2324 MulOp = Instruction::Mul; 2325 } else { 2326 AddOp = ID.getInductionOpcode(); 2327 MulOp = Instruction::FMul; 2328 } 2329 2330 // Determine the number of scalars we need to generate for each unroll 2331 // iteration. If EntryVal is uniform, we only need to generate the first 2332 // lane. Otherwise, we generate all VF values. 2333 unsigned Lanes = 2334 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2335 ? 1 2336 : VF.getKnownMinValue(); 2337 assert((!VF.isScalable() || Lanes == 1) && 2338 "Should never scalarize a scalable vector"); 2339 // Compute the scalar steps and save the results in VectorLoopValueMap. 2340 for (unsigned Part = 0; Part < UF; ++Part) { 2341 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2342 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2343 ScalarIVTy->getScalarSizeInBits()); 2344 Value *StartIdx = 2345 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2346 if (ScalarIVTy->isFloatingPointTy()) 2347 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2348 StartIdx = addFastMathFlag(Builder.CreateBinOp( 2349 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); 2350 // The step returned by `createStepForVF` is a runtime-evaluated value 2351 // when VF is scalable. Otherwise, it should be folded into a Constant. 
2352 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2353 "Expected StartIdx to be folded to a constant when VF is not " 2354 "scalable"); 2355 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2356 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2357 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2358 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2359 } 2360 } 2361 } 2362 2363 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2364 assert(V != Induction && "The new induction variable should not be used."); 2365 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2366 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2367 2368 // If we have a stride that is replaced by one, do it here. Defer this for 2369 // the VPlan-native path until we start running Legal checks in that path. 2370 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2371 V = ConstantInt::get(V->getType(), 1); 2372 2373 // If we have a vector mapped to this value, return it. 2374 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2375 return VectorLoopValueMap.getVectorValue(V, Part); 2376 2377 // If the value has not been vectorized, check if it has been scalarized 2378 // instead. If it has been scalarized, and we actually need the value in 2379 // vector form, we will construct the vector values on demand. 2380 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2381 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2382 2383 // If we've scalarized a value, that value should be an instruction. 2384 auto *I = cast<Instruction>(V); 2385 2386 // If we aren't vectorizing, we can just copy the scalar map values over to 2387 // the vector map. 2388 if (VF.isScalar()) { 2389 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2390 return ScalarValue; 2391 } 2392 2393 // Get the last scalar instruction we generated for V and Part. If the value 2394 // is known to be uniform after vectorization, this corresponds to lane zero 2395 // of the Part unroll iteration. Otherwise, the last instruction is the one 2396 // we created for the last vector lane of the Part unroll iteration. 2397 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2398 ? 0 2399 : VF.getKnownMinValue() - 1; 2400 assert((!VF.isScalable() || LastLane == 0) && 2401 "Scalable vectorization can't lead to any scalarized values."); 2402 auto *LastInst = cast<Instruction>( 2403 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2404 2405 // Set the insert point after the last scalarized instruction. This ensures 2406 // the insertelement sequence will directly follow the scalar definitions. 2407 auto OldIP = Builder.saveIP(); 2408 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2409 Builder.SetInsertPoint(&*NewIP); 2410 2411 // However, if we are vectorizing, we need to construct the vector values. 2412 // If the value is known to be uniform after vectorization, we can just 2413 // broadcast the scalar value corresponding to lane zero for each unroll 2414 // iteration. Otherwise, we construct the vector values using insertelement 2415 // instructions. Since the resulting vectors are stored in 2416 // VectorLoopValueMap, we will only generate the insertelements once. 
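    // Illustrative packing (hypothetical IR) for VF=4:
    //   %p0 = insertelement <4 x i32> poison, i32 %s0, i32 0
    //   %p1 = insertelement <4 x i32> %p0,    i32 %s1, i32 1
    //   ... and so on for lanes 2 and 3, via packScalarIntoVectorValue.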
2417 Value *VectorValue = nullptr; 2418 if (Cost->isUniformAfterVectorization(I, VF)) { 2419 VectorValue = getBroadcastInstrs(ScalarValue); 2420 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2421 } else { 2422 // Initialize packing with insertelements to start from poison. 2423 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2424 Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF)); 2425 VectorLoopValueMap.setVectorValue(V, Part, Poison); 2426 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2427 packScalarIntoVectorValue(V, {Part, Lane}); 2428 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2429 } 2430 Builder.restoreIP(OldIP); 2431 return VectorValue; 2432 } 2433 2434 // If this scalar is unknown, assume that it is a constant or that it is 2435 // loop invariant. Broadcast V and save the value for future uses. 2436 Value *B = getBroadcastInstrs(V); 2437 VectorLoopValueMap.setVectorValue(V, Part, B); 2438 return B; 2439 } 2440 2441 Value * 2442 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2443 const VPIteration &Instance) { 2444 // If the value is not an instruction contained in the loop, it should 2445 // already be scalar. 2446 if (OrigLoop->isLoopInvariant(V)) 2447 return V; 2448 2449 assert(Instance.Lane > 0 2450 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2451 : true && "Uniform values only have lane zero"); 2452 2453 // If the value from the original loop has not been vectorized, it is 2454 // represented by UF x VF scalar values in the new loop. Return the requested 2455 // scalar value. 2456 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2457 return VectorLoopValueMap.getScalarValue(V, Instance); 2458 2459 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2460 // for the given unroll part. If this entry is not a vector type (i.e., the 2461 // vectorization factor is one), there is no need to generate an 2462 // extractelement instruction. 2463 auto *U = getOrCreateVectorValue(V, Instance.Part); 2464 if (!U->getType()->isVectorTy()) { 2465 assert(VF.isScalar() && "Value not scalarized has non-vector type"); 2466 return U; 2467 } 2468 2469 // Otherwise, the value from the original loop has been vectorized and is 2470 // represented by UF vector values. Extract and return the requested scalar 2471 // value from the appropriate vector lane. 
2472 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2473 } 2474 2475 void InnerLoopVectorizer::packScalarIntoVectorValue( 2476 Value *V, const VPIteration &Instance) { 2477 assert(V != Induction && "The new induction variable should not be used."); 2478 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2479 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2480 2481 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2482 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2483 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2484 Builder.getInt32(Instance.Lane)); 2485 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2486 } 2487 2488 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2489 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2490 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2491 SmallVector<int, 8> ShuffleMask; 2492 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2493 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2494 2495 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); 2496 } 2497 2498 // Return whether we allow using masked interleave-groups (for dealing with 2499 // strided loads/stores that reside in predicated blocks, or for dealing 2500 // with gaps). 2501 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2502 // If an override option has been passed in for interleaved accesses, use it. 2503 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2504 return EnableMaskedInterleavedMemAccesses; 2505 2506 return TTI.enableMaskedInterleavedAccessVectorization(); 2507 } 2508 2509 // Try to vectorize the interleave group that \p Instr belongs to. 2510 // 2511 // E.g. Translate following interleaved load group (factor = 3): 2512 // for (i = 0; i < N; i+=3) { 2513 // R = Pic[i]; // Member of index 0 2514 // G = Pic[i+1]; // Member of index 1 2515 // B = Pic[i+2]; // Member of index 2 2516 // ... // do something to R, G, B 2517 // } 2518 // To: 2519 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2520 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2521 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2522 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2523 // 2524 // Or translate following interleaved store group (factor = 3): 2525 // for (i = 0; i < N; i+=3) { 2526 // ... do something to R, G, B 2527 // Pic[i] = R; // Member of index 0 2528 // Pic[i+1] = G; // Member of index 1 2529 // Pic[i+2] = B; // Member of index 2 2530 // } 2531 // To: 2532 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2533 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2534 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2535 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2536 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2537 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2538 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2539 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2540 VPValue *BlockInMask) { 2541 Instruction *Instr = Group->getInsertPos(); 2542 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2543 2544 // Prepare for the vector type of the interleaved load/store. 
2545 Type *ScalarTy = getMemInstValueType(Instr); 2546 unsigned InterleaveFactor = Group->getFactor(); 2547 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2548 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2549 2550 // Prepare for the new pointers. 2551 SmallVector<Value *, 2> AddrParts; 2552 unsigned Index = Group->getIndex(Instr); 2553 2554 // TODO: extend the masked interleaved-group support to reversed access. 2555 assert((!BlockInMask || !Group->isReverse()) && 2556 "Reversed masked interleave-group not supported."); 2557 2558 // If the group is reverse, adjust the index to refer to the last vector lane 2559 // instead of the first. We adjust the index from the first vector lane, 2560 // rather than directly getting the pointer for lane VF - 1, because the 2561 // pointer operand of the interleaved access is supposed to be uniform. For 2562 // uniform instructions, we're only required to generate a value for the 2563 // first vector lane in each unroll iteration. 2564 assert(!VF.isScalable() && 2565 "scalable vector reverse operation is not implemented"); 2566 if (Group->isReverse()) 2567 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2568 2569 for (unsigned Part = 0; Part < UF; Part++) { 2570 Value *AddrPart = State.get(Addr, {Part, 0}); 2571 setDebugLocFromInst(Builder, AddrPart); 2572 2573 // Notice current instruction could be any index. Need to adjust the address 2574 // to the member of index 0. 2575 // 2576 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2577 // b = A[i]; // Member of index 0 2578 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2579 // 2580 // E.g. A[i+1] = a; // Member of index 1 2581 // A[i] = b; // Member of index 0 2582 // A[i+2] = c; // Member of index 2 (Current instruction) 2583 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2584 2585 bool InBounds = false; 2586 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2587 InBounds = gep->isInBounds(); 2588 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2589 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2590 2591 // Cast to the vector pointer type. 2592 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2593 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2594 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2595 } 2596 2597 setDebugLocFromInst(Builder, Instr); 2598 Value *PoisonVec = PoisonValue::get(VecTy); 2599 2600 Value *MaskForGaps = nullptr; 2601 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2602 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2603 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2604 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2605 } 2606 2607 // Vectorize the interleaved load group. 2608 if (isa<LoadInst>(Instr)) { 2609 // For each unroll part, create a wide load for the group. 
2610 SmallVector<Value *, 2> NewLoads; 2611 for (unsigned Part = 0; Part < UF; Part++) { 2612 Instruction *NewLoad; 2613 if (BlockInMask || MaskForGaps) { 2614 assert(useMaskedInterleavedAccesses(*TTI) && 2615 "masked interleaved groups are not allowed."); 2616 Value *GroupMask = MaskForGaps; 2617 if (BlockInMask) { 2618 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2619 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2620 Value *ShuffledMask = Builder.CreateShuffleVector( 2621 BlockInMaskPart, 2622 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2623 "interleaved.mask"); 2624 GroupMask = MaskForGaps 2625 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2626 MaskForGaps) 2627 : ShuffledMask; 2628 } 2629 NewLoad = 2630 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2631 GroupMask, PoisonVec, "wide.masked.vec"); 2632 } 2633 else 2634 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2635 Group->getAlign(), "wide.vec"); 2636 Group->addMetadata(NewLoad); 2637 NewLoads.push_back(NewLoad); 2638 } 2639 2640 // For each member in the group, shuffle out the appropriate data from the 2641 // wide loads. 2642 unsigned J = 0; 2643 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2644 Instruction *Member = Group->getMember(I); 2645 2646 // Skip the gaps in the group. 2647 if (!Member) 2648 continue; 2649 2650 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2651 auto StrideMask = 2652 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2653 for (unsigned Part = 0; Part < UF; Part++) { 2654 Value *StridedVec = Builder.CreateShuffleVector( 2655 NewLoads[Part], StrideMask, "strided.vec"); 2656 2657 // If this member has different type, cast the result type. 2658 if (Member->getType() != ScalarTy) { 2659 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2660 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2661 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2662 } 2663 2664 if (Group->isReverse()) 2665 StridedVec = reverseVector(StridedVec); 2666 2667 State.set(VPDefs[J], Member, StridedVec, Part); 2668 } 2669 ++J; 2670 } 2671 return; 2672 } 2673 2674 // The sub vector type for current instruction. 2675 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2676 auto *SubVT = VectorType::get(ScalarTy, VF); 2677 2678 // Vectorize the interleaved store group. 2679 for (unsigned Part = 0; Part < UF; Part++) { 2680 // Collect the stored vector from each member. 2681 SmallVector<Value *, 4> StoredVecs; 2682 for (unsigned i = 0; i < InterleaveFactor; i++) { 2683 // Interleaved store group doesn't allow a gap, so each index has a member 2684 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2685 2686 Value *StoredVec = State.get(StoredValues[i], Part); 2687 2688 if (Group->isReverse()) 2689 StoredVec = reverseVector(StoredVec); 2690 2691 // If this member has different type, cast it to a unified type. 2692 2693 if (StoredVec->getType() != SubVT) 2694 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2695 2696 StoredVecs.push_back(StoredVec); 2697 } 2698 2699 // Concatenate all vectors into a wide vector. 2700 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2701 2702 // Interleave the elements in the wide vector. 
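    // E.g. (illustrative) for VF = 4 and factor = 3, createInterleaveMask(4, 3)
    // yields <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>, which reorders the
    // concatenated <R..., G..., B...> wide vector into
    // <R0, G0, B0, R1, G1, B1, ...>, as in the store example at the top of
    // this function.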
2703 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2704 Value *IVec = Builder.CreateShuffleVector( 2705 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2706 "interleaved.vec"); 2707 2708 Instruction *NewStoreInstr; 2709 if (BlockInMask) { 2710 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2711 Value *ShuffledMask = Builder.CreateShuffleVector( 2712 BlockInMaskPart, 2713 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2714 "interleaved.mask"); 2715 NewStoreInstr = Builder.CreateMaskedStore( 2716 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2717 } 2718 else 2719 NewStoreInstr = 2720 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2721 2722 Group->addMetadata(NewStoreInstr); 2723 } 2724 } 2725 2726 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2727 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2728 VPValue *StoredValue, VPValue *BlockInMask) { 2729 // Attempt to issue a wide load. 2730 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2731 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2732 2733 assert((LI || SI) && "Invalid Load/Store instruction"); 2734 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2735 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2736 2737 LoopVectorizationCostModel::InstWidening Decision = 2738 Cost->getWideningDecision(Instr, VF); 2739 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2740 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2741 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2742 "CM decision is not to widen the memory instruction"); 2743 2744 Type *ScalarDataTy = getMemInstValueType(Instr); 2745 2746 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2747 const Align Alignment = getLoadStoreAlignment(Instr); 2748 2749 // Determine if the pointer operand of the access is either consecutive or 2750 // reverse consecutive. 2751 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2752 bool ConsecutiveStride = 2753 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2754 bool CreateGatherScatter = 2755 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2756 2757 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2758 // gather/scatter. Otherwise Decision should have been to Scalarize. 2759 assert((ConsecutiveStride || CreateGatherScatter) && 2760 "The instruction should be scalarized"); 2761 (void)ConsecutiveStride; 2762 2763 VectorParts BlockInMaskParts(UF); 2764 bool isMaskRequired = BlockInMask; 2765 if (isMaskRequired) 2766 for (unsigned Part = 0; Part < UF; ++Part) 2767 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2768 2769 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2770 // Calculate the pointer for the specific unroll-part. 2771 GetElementPtrInst *PartPtr = nullptr; 2772 2773 bool InBounds = false; 2774 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2775 InBounds = gep->isInBounds(); 2776 2777 if (Reverse) { 2778 assert(!VF.isScalable() && 2779 "Reversing vectors is not yet supported for scalable vectors."); 2780 2781 // If the address is consecutive but reversed, then the 2782 // wide store needs to start at the last vector element. 
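      // E.g. (illustrative) with VF = 4 and unroll part 1, the two GEPs below
      // offset the pointer by -1 * 4 and then by 1 - 4 = -3, so the wide
      // access covers elements [i - 7, i - 4]; the loaded or stored vector is
      // reversed separately to restore the original element order.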
2783 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2784 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2785 PartPtr->setIsInBounds(InBounds); 2786 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2787 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2788 PartPtr->setIsInBounds(InBounds); 2789 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2790 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2791 } else { 2792 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2793 PartPtr = cast<GetElementPtrInst>( 2794 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2795 PartPtr->setIsInBounds(InBounds); 2796 } 2797 2798 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2799 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2800 }; 2801 2802 // Handle Stores: 2803 if (SI) { 2804 setDebugLocFromInst(Builder, SI); 2805 2806 for (unsigned Part = 0; Part < UF; ++Part) { 2807 Instruction *NewSI = nullptr; 2808 Value *StoredVal = State.get(StoredValue, Part); 2809 if (CreateGatherScatter) { 2810 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2811 Value *VectorGep = State.get(Addr, Part); 2812 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2813 MaskPart); 2814 } else { 2815 if (Reverse) { 2816 // If we store to reverse consecutive memory locations, then we need 2817 // to reverse the order of elements in the stored value. 2818 StoredVal = reverseVector(StoredVal); 2819 // We don't want to update the value in the map as it might be used in 2820 // another expression. So don't call resetVectorValue(StoredVal). 2821 } 2822 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2823 if (isMaskRequired) 2824 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2825 BlockInMaskParts[Part]); 2826 else 2827 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2828 } 2829 addMetadata(NewSI, SI); 2830 } 2831 return; 2832 } 2833 2834 // Handle loads. 2835 assert(LI && "Must have a load instruction"); 2836 setDebugLocFromInst(Builder, LI); 2837 for (unsigned Part = 0; Part < UF; ++Part) { 2838 Value *NewLI; 2839 if (CreateGatherScatter) { 2840 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2841 Value *VectorGep = State.get(Addr, Part); 2842 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2843 nullptr, "wide.masked.gather"); 2844 addMetadata(NewLI, LI); 2845 } else { 2846 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2847 if (isMaskRequired) 2848 NewLI = Builder.CreateMaskedLoad( 2849 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2850 "wide.masked.load"); 2851 else 2852 NewLI = 2853 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2854 2855 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2856 addMetadata(NewLI, LI); 2857 if (Reverse) 2858 NewLI = reverseVector(NewLI); 2859 } 2860 2861 State.set(Def, Instr, NewLI, Part); 2862 } 2863 } 2864 2865 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2866 const VPIteration &Instance, 2867 bool IfPredicateInstr, 2868 VPTransformState &State) { 2869 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2870 2871 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2872 // the first lane and part. 
  if (auto *II = dyn_cast<IntrinsicInst>(Instr))
    if (Instance.Lane != 0 || Instance.Part != 0)
      if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl)
        return;

  setDebugLocFromInst(Builder, Instr);

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  // Replace the operands of the cloned instruction with their scalar
  // equivalents in the new loop.
  for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
    auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
    auto InputInstance = Instance;
    if (!Operand || !OrigLoop->contains(Operand) ||
        (Cost->isUniformAfterVectorization(Operand, State.VF)))
      InputInstance.Lane = 0;
    auto *NewOp = State.get(User.getOperand(op), InputInstance);
    Cloned->setOperand(op, NewOp);
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  // TODO: Set result for VPValue of VPReplicateRecipe. This requires
  // representing scalar values in VPTransformState. Add the cloned scalar to
  // the scalar map entry.
  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);

  // If we just cloned a new assumption, add it to the assumption cache.
  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
    if (II->getIntrinsicID() == Intrinsic::assume)
      AC->registerAssumption(II);

  // End if-block.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}

PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(Builder, OldInst);
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  Builder.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(Builder, OldInst);

  // Create i+1 and fill the PHINode.
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
  Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}

Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
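  // E.g. (illustrative) for "for (i = 0; i < n; ++i)" SCEV reports a
  // backedge-taken count of n - 1; below it is converted to the widest
  // induction type and incremented by 1 to obtain the trip count n.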
2957 ScalarEvolution *SE = PSE.getSE(); 2958 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2959 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2960 "Invalid loop count"); 2961 2962 Type *IdxTy = Legal->getWidestInductionType(); 2963 assert(IdxTy && "No type for induction"); 2964 2965 // The exit count might have the type of i64 while the phi is i32. This can 2966 // happen if we have an induction variable that is sign extended before the 2967 // compare. The only way that we get a backedge taken count is that the 2968 // induction variable was signed and as such will not overflow. In such a case 2969 // truncation is legal. 2970 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2971 IdxTy->getPrimitiveSizeInBits()) 2972 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2973 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2974 2975 // Get the total trip count from the count by adding 1. 2976 const SCEV *ExitCount = SE->getAddExpr( 2977 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2978 2979 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2980 2981 // Expand the trip count and place the new instructions in the preheader. 2982 // Notice that the pre-header does not change, only the loop body. 2983 SCEVExpander Exp(*SE, DL, "induction"); 2984 2985 // Count holds the overall loop count (N). 2986 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2987 L->getLoopPreheader()->getTerminator()); 2988 2989 if (TripCount->getType()->isPointerTy()) 2990 TripCount = 2991 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2992 L->getLoopPreheader()->getTerminator()); 2993 2994 return TripCount; 2995 } 2996 2997 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2998 if (VectorTripCount) 2999 return VectorTripCount; 3000 3001 Value *TC = getOrCreateTripCount(L); 3002 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3003 3004 Type *Ty = TC->getType(); 3005 // This is where we can make the step a runtime constant. 3006 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3007 3008 // If the tail is to be folded by masking, round the number of iterations N 3009 // up to a multiple of Step instead of rounding down. This is done by first 3010 // adding Step-1 and then rounding down. Note that it's ok if this addition 3011 // overflows: the vector induction variable will eventually wrap to zero given 3012 // that it starts at zero and its Step is a power of two; the loop will then 3013 // exit, with the last early-exit vector comparison also producing all-true. 3014 if (Cost->foldTailByMasking()) { 3015 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3016 "VF*UF must be a power of 2 when folding tail by masking"); 3017 assert(!VF.isScalable() && 3018 "Tail folding not yet supported for scalable vectors"); 3019 TC = Builder.CreateAdd( 3020 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3021 } 3022 3023 // Now we need to generate the expression for the part of the loop that the 3024 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3025 // iterations are not required for correctness, or N - Step, otherwise. Step 3026 // is equal to the vectorization factor (number of SIMD elements) times the 3027 // unroll factor (number of SIMD instructions). 
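  // A worked example (illustrative): for a trip count of 17 with VF = 4 and
  // UF = 2, Step is 8, n.mod.vf is 17 % 8 = 1 and n.vec is 16, so the vector
  // body covers 16 scalar iterations and the epilogue runs the remaining one.
  // If a scalar epilogue is required and the remainder were 0, it is bumped
  // up to Step below so that at least one scalar iteration always runs.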
3028 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3029 3030 // There are two cases where we need to ensure (at least) the last iteration 3031 // runs in the scalar remainder loop. Thus, if the step evenly divides 3032 // the trip count, we set the remainder to be equal to the step. If the step 3033 // does not evenly divide the trip count, no adjustment is necessary since 3034 // there will already be scalar iterations. Note that the minimum iterations 3035 // check ensures that N >= Step. The cases are: 3036 // 1) If there is a non-reversed interleaved group that may speculatively 3037 // access memory out-of-bounds. 3038 // 2) If any instruction may follow a conditionally taken exit. That is, if 3039 // the loop contains multiple exiting blocks, or a single exiting block 3040 // which is not the latch. 3041 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3042 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3043 R = Builder.CreateSelect(IsZero, Step, R); 3044 } 3045 3046 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3047 3048 return VectorTripCount; 3049 } 3050 3051 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3052 const DataLayout &DL) { 3053 // Verify that V is a vector type with same number of elements as DstVTy. 3054 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3055 unsigned VF = DstFVTy->getNumElements(); 3056 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3057 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3058 Type *SrcElemTy = SrcVecTy->getElementType(); 3059 Type *DstElemTy = DstFVTy->getElementType(); 3060 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3061 "Vector elements must have same size"); 3062 3063 // Do a direct cast if element types are castable. 3064 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3065 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3066 } 3067 // V cannot be directly casted to desired vector type. 3068 // May happen when V is a floating point vector but DstVTy is a vector of 3069 // pointers or vice-versa. Handle this using a two-step bitcast using an 3070 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3071 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3072 "Only one type should be a pointer type"); 3073 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3074 "Only one type should be a floating point type"); 3075 Type *IntTy = 3076 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3077 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3078 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3079 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3080 } 3081 3082 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3083 BasicBlock *Bypass) { 3084 Value *Count = getOrCreateTripCount(L); 3085 // Reuse existing vector loop preheader for TC checks. 3086 // Note that new preheader block is generated for vector loop. 3087 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3088 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3089 3090 // Generate code to check if the loop's trip count is less than VF * UF, or 3091 // equal to it in case a scalar epilogue is required; this implies that the 3092 // vector trip count is zero. 
This check also covers the case where adding one 3093 // to the backedge-taken count overflowed leading to an incorrect trip count 3094 // of zero. In this case we will also jump to the scalar loop. 3095 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3096 : ICmpInst::ICMP_ULT; 3097 3098 // If tail is to be folded, vector loop takes care of all iterations. 3099 Value *CheckMinIters = Builder.getFalse(); 3100 if (!Cost->foldTailByMasking()) { 3101 Value *Step = 3102 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3103 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3104 } 3105 // Create new preheader for vector loop. 3106 LoopVectorPreHeader = 3107 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3108 "vector.ph"); 3109 3110 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3111 DT->getNode(Bypass)->getIDom()) && 3112 "TC check is expected to dominate Bypass"); 3113 3114 // Update dominator for Bypass & LoopExit. 3115 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3116 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3117 3118 ReplaceInstWithInst( 3119 TCCheckBlock->getTerminator(), 3120 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3121 LoopBypassBlocks.push_back(TCCheckBlock); 3122 } 3123 3124 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3125 // Reuse existing vector loop preheader for SCEV checks. 3126 // Note that new preheader block is generated for vector loop. 3127 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 3128 3129 // Generate the code to check that the SCEV assumptions that we made. 3130 // We want the new basic block to start at the first instruction in a 3131 // sequence of instructions that form a check. 3132 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 3133 "scev.check"); 3134 Value *SCEVCheck = Exp.expandCodeForPredicate( 3135 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 3136 3137 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 3138 if (C->isZero()) 3139 return; 3140 3141 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3142 (OptForSizeBasedOnProfile && 3143 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3144 "Cannot SCEV check stride or overflow when optimizing for size"); 3145 3146 SCEVCheckBlock->setName("vector.scevcheck"); 3147 // Create new preheader for vector loop. 3148 LoopVectorPreHeader = 3149 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 3150 nullptr, "vector.ph"); 3151 3152 // Update dominator only if this is first RT check. 3153 if (LoopBypassBlocks.empty()) { 3154 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3155 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3156 } 3157 3158 ReplaceInstWithInst( 3159 SCEVCheckBlock->getTerminator(), 3160 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 3161 LoopBypassBlocks.push_back(SCEVCheckBlock); 3162 AddedSafetyChecks = true; 3163 } 3164 3165 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 3166 // VPlan-native path does not do any analysis for runtime checks currently. 3167 if (EnableVPlanNativePath) 3168 return; 3169 3170 // Reuse existing vector loop preheader for runtime memory checks. 3171 // Note that new preheader block is generated for vector loop. 3172 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 3173 3174 // Generate the code that checks in runtime if arrays overlap. 
We put the 3175 // checks into a separate block to make the more common case of few elements 3176 // faster. 3177 auto *LAI = Legal->getLAI(); 3178 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 3179 if (!RtPtrChecking.Need) 3180 return; 3181 3182 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3183 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3184 "Cannot emit memory checks when optimizing for size, unless forced " 3185 "to vectorize."); 3186 ORE->emit([&]() { 3187 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3188 L->getStartLoc(), L->getHeader()) 3189 << "Code-size may be reduced by not forcing " 3190 "vectorization, or by source-code modifications " 3191 "eliminating the need for runtime checks " 3192 "(e.g., adding 'restrict')."; 3193 }); 3194 } 3195 3196 MemCheckBlock->setName("vector.memcheck"); 3197 // Create new preheader for vector loop. 3198 LoopVectorPreHeader = 3199 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 3200 "vector.ph"); 3201 3202 auto *CondBranch = cast<BranchInst>( 3203 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 3204 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 3205 LoopBypassBlocks.push_back(MemCheckBlock); 3206 AddedSafetyChecks = true; 3207 3208 // Update dominator only if this is first RT check. 3209 if (LoopBypassBlocks.empty()) { 3210 DT->changeImmediateDominator(Bypass, MemCheckBlock); 3211 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 3212 } 3213 3214 Instruction *FirstCheckInst; 3215 Instruction *MemRuntimeCheck; 3216 std::tie(FirstCheckInst, MemRuntimeCheck) = 3217 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 3218 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 3219 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 3220 "claimed checks are required"); 3221 CondBranch->setCondition(MemRuntimeCheck); 3222 3223 // We currently don't use LoopVersioning for the actual loop cloning but we 3224 // still use it to add the noalias metadata. 3225 LVer = std::make_unique<LoopVersioning>( 3226 *Legal->getLAI(), 3227 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3228 DT, PSE.getSE()); 3229 LVer->prepareNoAliasMetadata(); 3230 } 3231 3232 Value *InnerLoopVectorizer::emitTransformedIndex( 3233 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3234 const InductionDescriptor &ID) const { 3235 3236 SCEVExpander Exp(*SE, DL, "induction"); 3237 auto Step = ID.getStep(); 3238 auto StartValue = ID.getStartValue(); 3239 assert(Index->getType() == Step->getType() && 3240 "Index type does not match StepValue type"); 3241 3242 // Note: the IR at this point is broken. We cannot use SE to create any new 3243 // SCEV and then expand it, hoping that SCEV's simplification will give us 3244 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3245 // lead to various SCEV crashes. So all we can do is to use builder and rely 3246 // on InstCombine for future simplifications. Here we handle some trivial 3247 // cases only. 
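  // For example (illustrative): for an integer induction with start value 10
  // and constant step 3, the transformed index for an index value N is simply
  // 10 + N * 3; the CreateAdd/CreateMul helpers below only fold the trivial
  // "+ 0" and "* 1" cases instead of relying on SCEV simplification.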
3248 auto CreateAdd = [&B](Value *X, Value *Y) { 3249 assert(X->getType() == Y->getType() && "Types don't match!"); 3250 if (auto *CX = dyn_cast<ConstantInt>(X)) 3251 if (CX->isZero()) 3252 return Y; 3253 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3254 if (CY->isZero()) 3255 return X; 3256 return B.CreateAdd(X, Y); 3257 }; 3258 3259 auto CreateMul = [&B](Value *X, Value *Y) { 3260 assert(X->getType() == Y->getType() && "Types don't match!"); 3261 if (auto *CX = dyn_cast<ConstantInt>(X)) 3262 if (CX->isOne()) 3263 return Y; 3264 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3265 if (CY->isOne()) 3266 return X; 3267 return B.CreateMul(X, Y); 3268 }; 3269 3270 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3271 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3272 // the DomTree is not kept up-to-date for additional blocks generated in the 3273 // vector loop. By using the header as insertion point, we guarantee that the 3274 // expanded instructions dominate all their uses. 3275 auto GetInsertPoint = [this, &B]() { 3276 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3277 if (InsertBB != LoopVectorBody && 3278 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3279 return LoopVectorBody->getTerminator(); 3280 return &*B.GetInsertPoint(); 3281 }; 3282 switch (ID.getKind()) { 3283 case InductionDescriptor::IK_IntInduction: { 3284 assert(Index->getType() == StartValue->getType() && 3285 "Index type does not match StartValue type"); 3286 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3287 return B.CreateSub(StartValue, Index); 3288 auto *Offset = CreateMul( 3289 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3290 return CreateAdd(StartValue, Offset); 3291 } 3292 case InductionDescriptor::IK_PtrInduction: { 3293 assert(isa<SCEVConstant>(Step) && 3294 "Expected constant step for pointer induction"); 3295 return B.CreateGEP( 3296 StartValue->getType()->getPointerElementType(), StartValue, 3297 CreateMul(Index, 3298 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3299 } 3300 case InductionDescriptor::IK_FpInduction: { 3301 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3302 auto InductionBinOp = ID.getInductionBinOp(); 3303 assert(InductionBinOp && 3304 (InductionBinOp->getOpcode() == Instruction::FAdd || 3305 InductionBinOp->getOpcode() == Instruction::FSub) && 3306 "Original bin op should be defined for FP induction"); 3307 3308 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3309 3310 // Floating point operations had to be 'fast' to enable the induction. 3311 FastMathFlags Flags; 3312 Flags.setFast(); 3313 3314 Value *MulExp = B.CreateFMul(StepValue, Index); 3315 if (isa<Instruction>(MulExp)) 3316 // We have to check, the MulExp may be a constant. 
3317 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3318 3319 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3320 "induction"); 3321 if (isa<Instruction>(BOp)) 3322 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3323 3324 return BOp; 3325 } 3326 case InductionDescriptor::IK_NoInduction: 3327 return nullptr; 3328 } 3329 llvm_unreachable("invalid enum"); 3330 } 3331 3332 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3333 LoopScalarBody = OrigLoop->getHeader(); 3334 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3335 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3336 assert(LoopExitBlock && "Must have an exit block"); 3337 assert(LoopVectorPreHeader && "Invalid loop structure"); 3338 3339 LoopMiddleBlock = 3340 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3341 LI, nullptr, Twine(Prefix) + "middle.block"); 3342 LoopScalarPreHeader = 3343 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3344 nullptr, Twine(Prefix) + "scalar.ph"); 3345 3346 // Set up branch from middle block to the exit and scalar preheader blocks. 3347 // completeLoopSkeleton will update the condition to use an iteration check, 3348 // if required to decide whether to execute the remainder. 3349 BranchInst *BrInst = 3350 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3351 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3352 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3353 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3354 3355 // We intentionally don't let SplitBlock to update LoopInfo since 3356 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3357 // LoopVectorBody is explicitly added to the correct place few lines later. 3358 LoopVectorBody = 3359 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3360 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3361 3362 // Update dominator for loop exit. 3363 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3364 3365 // Create and register the new vector loop. 3366 Loop *Lp = LI->AllocateLoop(); 3367 Loop *ParentLoop = OrigLoop->getParentLoop(); 3368 3369 // Insert the new loop into the loop nest and register the new basic blocks 3370 // before calling any utilities such as SCEV that require valid LoopInfo. 3371 if (ParentLoop) { 3372 ParentLoop->addChildLoop(Lp); 3373 } else { 3374 LI->addTopLevelLoop(Lp); 3375 } 3376 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3377 return Lp; 3378 } 3379 3380 void InnerLoopVectorizer::createInductionResumeValues( 3381 Loop *L, Value *VectorTripCount, 3382 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3383 assert(VectorTripCount && L && "Expected valid arguments"); 3384 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3385 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3386 "Inconsistent information about additional bypass."); 3387 // We are going to resume the execution of the scalar loop. 3388 // Go over all of the induction variables that we found and fix the 3389 // PHIs that are left in the scalar version of the loop. 3390 // The starting values of PHI nodes depend on the counter of the last 3391 // iteration in the vectorized loop. 3392 // If we come from a bypass edge then we need to start from the original 3393 // start value. 
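  // E.g. for a primary induction "i = phi [0, %ph], [i + 1, %latch]" the
  // scalar preheader receives a resume phi along the lines of
  //   %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %bypass ], ...
  // so the remainder loop resumes at the first iteration the vector loop did
  // not execute (block and value names here are illustrative).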
3394 for (auto &InductionEntry : Legal->getInductionVars()) { 3395 PHINode *OrigPhi = InductionEntry.first; 3396 InductionDescriptor II = InductionEntry.second; 3397 3398 // Create phi nodes to merge from the backedge-taken check block. 3399 PHINode *BCResumeVal = 3400 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3401 LoopScalarPreHeader->getTerminator()); 3402 // Copy original phi DL over to the new one. 3403 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3404 Value *&EndValue = IVEndValues[OrigPhi]; 3405 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3406 if (OrigPhi == OldInduction) { 3407 // We know what the end value is. 3408 EndValue = VectorTripCount; 3409 } else { 3410 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3411 Type *StepType = II.getStep()->getType(); 3412 Instruction::CastOps CastOp = 3413 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3414 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3415 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3416 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3417 EndValue->setName("ind.end"); 3418 3419 // Compute the end value for the additional bypass (if applicable). 3420 if (AdditionalBypass.first) { 3421 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3422 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3423 StepType, true); 3424 CRD = 3425 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3426 EndValueFromAdditionalBypass = 3427 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3428 EndValueFromAdditionalBypass->setName("ind.end"); 3429 } 3430 } 3431 // The new PHI merges the original incoming value, in case of a bypass, 3432 // or the value at the end of the vectorized loop. 3433 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3434 3435 // Fix the scalar body counter (PHI node). 3436 // The old induction's phi node in the scalar body needs the truncated 3437 // value. 3438 for (BasicBlock *BB : LoopBypassBlocks) 3439 BCResumeVal->addIncoming(II.getStartValue(), BB); 3440 3441 if (AdditionalBypass.first) 3442 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3443 EndValueFromAdditionalBypass); 3444 3445 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3446 } 3447 } 3448 3449 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3450 MDNode *OrigLoopID) { 3451 assert(L && "Expected valid loop."); 3452 3453 // The trip counts should be cached by now. 3454 Value *Count = getOrCreateTripCount(L); 3455 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3456 3457 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3458 3459 // Add a check in the middle block to see if we have completed 3460 // all of the iterations in the first vector loop. 3461 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3462 // If tail is to be folded, we know we don't need to run the remainder. 3463 if (!Cost->foldTailByMasking()) { 3464 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3465 Count, VectorTripCount, "cmp.n", 3466 LoopMiddleBlock->getTerminator()); 3467 3468 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3469 // of the corresponding compare because they may have ended up with 3470 // different line numbers and we want to avoid awkward line stepping while 3471 // debugging. Eg. if the compare has got a line number inside the loop. 
3472 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3473 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3474 } 3475 3476 // Get ready to start creating new instructions into the vectorized body. 3477 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3478 "Inconsistent vector loop preheader"); 3479 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3480 3481 Optional<MDNode *> VectorizedLoopID = 3482 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3483 LLVMLoopVectorizeFollowupVectorized}); 3484 if (VectorizedLoopID.hasValue()) { 3485 L->setLoopID(VectorizedLoopID.getValue()); 3486 3487 // Do not setAlreadyVectorized if loop attributes have been defined 3488 // explicitly. 3489 return LoopVectorPreHeader; 3490 } 3491 3492 // Keep all loop hints from the original loop on the vector loop (we'll 3493 // replace the vectorizer-specific hints below). 3494 if (MDNode *LID = OrigLoop->getLoopID()) 3495 L->setLoopID(LID); 3496 3497 LoopVectorizeHints Hints(L, true, *ORE); 3498 Hints.setAlreadyVectorized(); 3499 3500 #ifdef EXPENSIVE_CHECKS 3501 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3502 LI->verify(*DT); 3503 #endif 3504 3505 return LoopVectorPreHeader; 3506 } 3507 3508 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3509 /* 3510 In this function we generate a new loop. The new loop will contain 3511 the vectorized instructions while the old loop will continue to run the 3512 scalar remainder. 3513 3514 [ ] <-- loop iteration number check. 3515 / | 3516 / v 3517 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3518 | / | 3519 | / v 3520 || [ ] <-- vector pre header. 3521 |/ | 3522 | v 3523 | [ ] \ 3524 | [ ]_| <-- vector loop. 3525 | | 3526 | v 3527 | -[ ] <--- middle-block. 3528 | / | 3529 | / v 3530 -|- >[ ] <--- new preheader. 3531 | | 3532 | v 3533 | [ ] \ 3534 | [ ]_| <-- old scalar loop to handle remainder. 3535 \ | 3536 \ v 3537 >[ ] <-- exit block. 3538 ... 3539 */ 3540 3541 // Get the metadata of the original loop before it gets modified. 3542 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3543 3544 // Create an empty vector loop, and prepare basic blocks for the runtime 3545 // checks. 3546 Loop *Lp = createVectorLoopSkeleton(""); 3547 3548 // Now, compare the new count to zero. If it is zero skip the vector loop and 3549 // jump to the scalar loop. This check also covers the case where the 3550 // backedge-taken count is uint##_max: adding one to it will overflow leading 3551 // to an incorrect trip count of zero. In this (rare) case we will also jump 3552 // to the scalar loop. 3553 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3554 3555 // Generate the code to check any assumptions that we've made for SCEV 3556 // expressions. 3557 emitSCEVChecks(Lp, LoopScalarPreHeader); 3558 3559 // Generate the code that checks in runtime if arrays overlap. We put the 3560 // checks into a separate block to make the more common case of few elements 3561 // faster. 3562 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3563 3564 // Some loops have a single integer induction variable, while other loops 3565 // don't. One example is c++ iterators that often have multiple pointer 3566 // induction variables. In the code below we also support a case where we 3567 // don't have a single induction variable. 3568 // 3569 // We try to obtain an induction variable from the original loop as hard 3570 // as possible. 
However if we don't find one that: 3571 // - is an integer 3572 // - counts from zero, stepping by one 3573 // - is the size of the widest induction variable type 3574 // then we create a new one. 3575 OldInduction = Legal->getPrimaryInduction(); 3576 Type *IdxTy = Legal->getWidestInductionType(); 3577 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3578 // The loop step is equal to the vectorization factor (num of SIMD elements) 3579 // times the unroll factor (num of SIMD instructions). 3580 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3581 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3582 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3583 Induction = 3584 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3585 getDebugLocFromInstOrOperands(OldInduction)); 3586 3587 // Emit phis for the new starting index of the scalar loop. 3588 createInductionResumeValues(Lp, CountRoundDown); 3589 3590 return completeLoopSkeleton(Lp, OrigLoopID); 3591 } 3592 3593 // Fix up external users of the induction variable. At this point, we are 3594 // in LCSSA form, with all external PHIs that use the IV having one input value, 3595 // coming from the remainder loop. We need those PHIs to also have a correct 3596 // value for the IV when arriving directly from the middle block. 3597 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3598 const InductionDescriptor &II, 3599 Value *CountRoundDown, Value *EndValue, 3600 BasicBlock *MiddleBlock) { 3601 // There are two kinds of external IV usages - those that use the value 3602 // computed in the last iteration (the PHI) and those that use the penultimate 3603 // value (the value that feeds into the phi from the loop latch). 3604 // We allow both, but they, obviously, have different values. 3605 3606 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3607 3608 DenseMap<Value *, Value *> MissingVals; 3609 3610 // An external user of the last iteration's value should see the value that 3611 // the remainder loop uses to initialize its own IV. 3612 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3613 for (User *U : PostInc->users()) { 3614 Instruction *UI = cast<Instruction>(U); 3615 if (!OrigLoop->contains(UI)) { 3616 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3617 MissingVals[UI] = EndValue; 3618 } 3619 } 3620 3621 // An external user of the penultimate value need to see EndValue - Step. 3622 // The simplest way to get this is to recompute it from the constituent SCEVs, 3623 // that is Start + (Step * (CRD - 1)). 3624 for (User *U : OrigPhi->users()) { 3625 auto *UI = cast<Instruction>(U); 3626 if (!OrigLoop->contains(UI)) { 3627 const DataLayout &DL = 3628 OrigLoop->getHeader()->getModule()->getDataLayout(); 3629 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3630 3631 IRBuilder<> B(MiddleBlock->getTerminator()); 3632 Value *CountMinusOne = B.CreateSub( 3633 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3634 Value *CMO = 3635 !II.getStep()->getType()->isIntegerTy() 3636 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3637 II.getStep()->getType()) 3638 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3639 CMO->setName("cast.cmo"); 3640 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3641 Escape->setName("ind.escape"); 3642 MissingVals[UI] = Escape; 3643 } 3644 } 3645 3646 for (auto &I : MissingVals) { 3647 PHINode *PHI = cast<PHINode>(I.first); 3648 // One corner case we have to handle is two IVs "chasing" each-other, 3649 // that is %IV2 = phi [...], [ %IV1, %latch ] 3650 // In this case, if IV1 has an external use, we need to avoid adding both 3651 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3652 // don't already have an incoming value for the middle block. 3653 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3654 PHI->addIncoming(I.second, MiddleBlock); 3655 } 3656 } 3657 3658 namespace { 3659 3660 struct CSEDenseMapInfo { 3661 static bool canHandle(const Instruction *I) { 3662 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3663 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3664 } 3665 3666 static inline Instruction *getEmptyKey() { 3667 return DenseMapInfo<Instruction *>::getEmptyKey(); 3668 } 3669 3670 static inline Instruction *getTombstoneKey() { 3671 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3672 } 3673 3674 static unsigned getHashValue(const Instruction *I) { 3675 assert(canHandle(I) && "Unknown instruction!"); 3676 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3677 I->value_op_end())); 3678 } 3679 3680 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3681 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3682 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3683 return LHS == RHS; 3684 return LHS->isIdenticalTo(RHS); 3685 } 3686 }; 3687 3688 } // end anonymous namespace 3689 3690 ///Perform cse of induction variable instructions. 3691 static void cse(BasicBlock *BB) { 3692 // Perform simple cse. 3693 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3694 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3695 Instruction *In = &*I++; 3696 3697 if (!CSEDenseMapInfo::canHandle(In)) 3698 continue; 3699 3700 // Check if we can replace this instruction with any of the 3701 // visited instructions. 3702 if (Instruction *V = CSEMap.lookup(In)) { 3703 In->replaceAllUsesWith(V); 3704 In->eraseFromParent(); 3705 continue; 3706 } 3707 3708 CSEMap[In] = In; 3709 } 3710 } 3711 3712 InstructionCost 3713 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3714 bool &NeedToScalarize) { 3715 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3716 Function *F = CI->getCalledFunction(); 3717 Type *ScalarRetTy = CI->getType(); 3718 SmallVector<Type *, 4> Tys, ScalarTys; 3719 for (auto &ArgOp : CI->arg_operands()) 3720 ScalarTys.push_back(ArgOp->getType()); 3721 3722 // Estimate cost of scalarized vector call. The source operands are assumed 3723 // to be vectors, so we need to extract individual elements from there, 3724 // execute VF scalar calls, and then gather the result into the vector return 3725 // value. 3726 InstructionCost ScalarCallCost = 3727 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3728 if (VF.isScalar()) 3729 return ScalarCallCost; 3730 3731 // Compute corresponding vector type for return value and arguments. 
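  // E.g. (illustrative) for a call to "float foo(float)" with VF = 4, RetTy
  // and the single argument type both become <4 x float>. The scalarized
  // estimate below is then 4 * ScalarCallCost plus the extract/insert
  // overhead, and it is compared against the cost of a <4 x float> vector
  // variant if the VFDatabase provides one; NeedToScalarize is cleared when
  // the vector call is cheaper.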
3732 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3733 for (Type *ScalarTy : ScalarTys) 3734 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3735 3736 // Compute costs of unpacking argument values for the scalar calls and 3737 // packing the return values to a vector. 3738 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3739 3740 InstructionCost Cost = 3741 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3742 3743 // If we can't emit a vector call for this function, then the currently found 3744 // cost is the cost we need to return. 3745 NeedToScalarize = true; 3746 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3747 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3748 3749 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3750 return Cost; 3751 3752 // If the corresponding vector cost is cheaper, return its cost. 3753 InstructionCost VectorCallCost = 3754 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3755 if (VectorCallCost < Cost) { 3756 NeedToScalarize = false; 3757 Cost = VectorCallCost; 3758 } 3759 return Cost; 3760 } 3761 3762 InstructionCost 3763 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3764 ElementCount VF) { 3765 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3766 assert(ID && "Expected intrinsic call!"); 3767 3768 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3769 return TTI.getIntrinsicInstrCost(CostAttrs, 3770 TargetTransformInfo::TCK_RecipThroughput); 3771 } 3772 3773 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3774 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3775 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3776 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3777 } 3778 3779 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3780 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3781 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3782 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3783 } 3784 3785 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3786 // For every instruction `I` in MinBWs, truncate the operands, create a 3787 // truncated version of `I` and reextend its result. InstCombine runs 3788 // later and will remove any ext/trunc pairs. 3789 SmallPtrSet<Value *, 4> Erased; 3790 for (const auto &KV : Cost->getMinimalBitwidths()) { 3791 // If the value wasn't vectorized, we must maintain the original scalar 3792 // type. The absence of the value from VectorLoopValueMap indicates that it 3793 // wasn't vectorized. 
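    // For a value that was vectorized, the rewrite below proceeds, e.g.
    // (illustrative), for an i32 add that MinBWs says only needs 8 bits at
    // VF = 4:
    //   %t0 = trunc <4 x i32> %a to <4 x i8>
    //   %t1 = trunc <4 x i32> %b to <4 x i8>
    //   %n  = add <4 x i8> %t0, %t1
    //   %r  = zext <4 x i8> %n to <4 x i32>
    // InstCombine is expected to remove any redundant trunc/zext pairs later.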
3794 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3795 continue; 3796 for (unsigned Part = 0; Part < UF; ++Part) { 3797 Value *I = getOrCreateVectorValue(KV.first, Part); 3798 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3799 continue; 3800 Type *OriginalTy = I->getType(); 3801 Type *ScalarTruncatedTy = 3802 IntegerType::get(OriginalTy->getContext(), KV.second); 3803 auto *TruncatedTy = FixedVectorType::get( 3804 ScalarTruncatedTy, 3805 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3806 if (TruncatedTy == OriginalTy) 3807 continue; 3808 3809 IRBuilder<> B(cast<Instruction>(I)); 3810 auto ShrinkOperand = [&](Value *V) -> Value * { 3811 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3812 if (ZI->getSrcTy() == TruncatedTy) 3813 return ZI->getOperand(0); 3814 return B.CreateZExtOrTrunc(V, TruncatedTy); 3815 }; 3816 3817 // The actual instruction modification depends on the instruction type, 3818 // unfortunately. 3819 Value *NewI = nullptr; 3820 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3821 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3822 ShrinkOperand(BO->getOperand(1))); 3823 3824 // Any wrapping introduced by shrinking this operation shouldn't be 3825 // considered undefined behavior. So, we can't unconditionally copy 3826 // arithmetic wrapping flags to NewI. 3827 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3828 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3829 NewI = 3830 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3831 ShrinkOperand(CI->getOperand(1))); 3832 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3833 NewI = B.CreateSelect(SI->getCondition(), 3834 ShrinkOperand(SI->getTrueValue()), 3835 ShrinkOperand(SI->getFalseValue())); 3836 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3837 switch (CI->getOpcode()) { 3838 default: 3839 llvm_unreachable("Unhandled cast!"); 3840 case Instruction::Trunc: 3841 NewI = ShrinkOperand(CI->getOperand(0)); 3842 break; 3843 case Instruction::SExt: 3844 NewI = B.CreateSExtOrTrunc( 3845 CI->getOperand(0), 3846 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3847 break; 3848 case Instruction::ZExt: 3849 NewI = B.CreateZExtOrTrunc( 3850 CI->getOperand(0), 3851 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3852 break; 3853 } 3854 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3855 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3856 ->getNumElements(); 3857 auto *O0 = B.CreateZExtOrTrunc( 3858 SI->getOperand(0), 3859 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3860 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3861 ->getNumElements(); 3862 auto *O1 = B.CreateZExtOrTrunc( 3863 SI->getOperand(1), 3864 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3865 3866 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3867 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3868 // Don't do anything with the operands, just extend the result. 
3869 continue; 3870 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3871 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3872 ->getNumElements(); 3873 auto *O0 = B.CreateZExtOrTrunc( 3874 IE->getOperand(0), 3875 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3876 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3877 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3878 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3879 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3880 ->getNumElements(); 3881 auto *O0 = B.CreateZExtOrTrunc( 3882 EE->getOperand(0), 3883 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3884 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3885 } else { 3886 // If we don't know what to do, be conservative and don't do anything. 3887 continue; 3888 } 3889 3890 // Lastly, extend the result. 3891 NewI->takeName(cast<Instruction>(I)); 3892 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3893 I->replaceAllUsesWith(Res); 3894 cast<Instruction>(I)->eraseFromParent(); 3895 Erased.insert(I); 3896 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3897 } 3898 } 3899 3900 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3901 for (const auto &KV : Cost->getMinimalBitwidths()) { 3902 // If the value wasn't vectorized, we must maintain the original scalar 3903 // type. The absence of the value from VectorLoopValueMap indicates that it 3904 // wasn't vectorized. 3905 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3906 continue; 3907 for (unsigned Part = 0; Part < UF; ++Part) { 3908 Value *I = getOrCreateVectorValue(KV.first, Part); 3909 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3910 if (Inst && Inst->use_empty()) { 3911 Value *NewI = Inst->getOperand(0); 3912 Inst->eraseFromParent(); 3913 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3914 } 3915 } 3916 } 3917 } 3918 3919 void InnerLoopVectorizer::fixVectorizedLoop() { 3920 // Insert truncates and extends for any truncated instructions as hints to 3921 // InstCombine. 3922 if (VF.isVector()) 3923 truncateToMinimalBitwidths(); 3924 3925 // Fix widened non-induction PHIs by setting up the PHI operands. 3926 if (OrigPHIsToFix.size()) { 3927 assert(EnableVPlanNativePath && 3928 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3929 fixNonInductionPHIs(); 3930 } 3931 3932 // At this point every instruction in the original loop is widened to a 3933 // vector form. Now we need to fix the recurrences in the loop. These PHI 3934 // nodes are currently empty because we did not want to introduce cycles. 3935 // This is the second stage of vectorizing recurrences. 3936 fixCrossIterationPHIs(); 3937 3938 // Forget the original basic block. 3939 PSE.getSE()->forgetLoop(OrigLoop); 3940 3941 // Fix-up external users of the induction variables. 3942 for (auto &Entry : Legal->getInductionVars()) 3943 fixupIVUsers(Entry.first, Entry.second, 3944 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3945 IVEndValues[Entry.first], LoopMiddleBlock); 3946 3947 fixLCSSAPHIs(); 3948 for (Instruction *PI : PredicatedInstructions) 3949 sinkScalarOperands(&*PI); 3950 3951 // Remove redundant induction instructions. 3952 cse(LoopVectorBody); 3953 3954 // Set/update profile weights for the vector and remainder loops as original 3955 // loop iterations are now distributed among them. Note that original loop 3956 // represented by LoopScalarBody becomes remainder loop after vectorization. 
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(
      LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
      LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}

void InnerLoopVectorizer::fixCrossIterationPHIs() {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
    // Handle first-order recurrences and reductions that need to be fixed.
    if (Legal->isFirstOrderRecurrence(&Phi))
      fixFirstOrderRecurrence(&Phi);
    else if (Legal->isReductionVariable(&Phi))
      fixReduction(&Phi);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // temporary value for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body:
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Get the original loop preheader and single loop latch.
4039 auto *Preheader = OrigLoop->getLoopPreheader(); 4040 auto *Latch = OrigLoop->getLoopLatch(); 4041 4042 // Get the initial and previous values of the scalar recurrence. 4043 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4044 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4045 4046 // Create a vector from the initial value. 4047 auto *VectorInit = ScalarInit; 4048 if (VF.isVector()) { 4049 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4050 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4051 VectorInit = Builder.CreateInsertElement( 4052 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4053 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4054 } 4055 4056 // We constructed a temporary phi node in the first phase of vectorization. 4057 // This phi node will eventually be deleted. 4058 Builder.SetInsertPoint( 4059 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 4060 4061 // Create a phi node for the new recurrence. The current value will either be 4062 // the initial value inserted into a vector or loop-varying vector value. 4063 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4064 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4065 4066 // Get the vectorized previous value of the last part UF - 1. It appears last 4067 // among all unrolled iterations, due to the order of their construction. 4068 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 4069 4070 // Find and set the insertion point after the previous value if it is an 4071 // instruction. 4072 BasicBlock::iterator InsertPt; 4073 // Note that the previous value may have been constant-folded so it is not 4074 // guaranteed to be an instruction in the vector loop. 4075 // FIXME: Loop invariant values do not form recurrences. We should deal with 4076 // them earlier. 4077 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4078 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4079 else { 4080 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4081 if (isa<PHINode>(PreviousLastPart)) 4082 // If the previous value is a phi node, we should insert after all the phi 4083 // nodes in the block containing the PHI to avoid breaking basic block 4084 // verification. Note that the basic block may be different to 4085 // LoopVectorBody, in case we predicate the loop. 4086 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4087 else 4088 InsertPt = ++PreviousInst->getIterator(); 4089 } 4090 Builder.SetInsertPoint(&*InsertPt); 4091 4092 // We will construct a vector for the recurrence by combining the values for 4093 // the current and previous iterations. This is the required shuffle mask. 4094 assert(!VF.isScalable()); 4095 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4096 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4097 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4098 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4099 4100 // The vector from which to take the initial value for the current iteration 4101 // (actual or unrolled). Initially, this is the vector phi node. 4102 Value *Incoming = VecPhi; 4103 4104 // Shuffle the current and previous vector and update the vector parts. 4105 for (unsigned Part = 0; Part < UF; ++Part) { 4106 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 4107 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 4108 auto *Shuffle = 4109 VF.isVector() 4110 ? 
Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4111 : Incoming; 4112 PhiPart->replaceAllUsesWith(Shuffle); 4113 cast<Instruction>(PhiPart)->eraseFromParent(); 4114 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 4115 Incoming = PreviousPart; 4116 } 4117 4118 // Fix the latch value of the new recurrence in the vector loop. 4119 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4120 4121 // Extract the last vector element in the middle block. This will be the 4122 // initial value for the recurrence when jumping to the scalar loop. 4123 auto *ExtractForScalar = Incoming; 4124 if (VF.isVector()) { 4125 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4126 ExtractForScalar = Builder.CreateExtractElement( 4127 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4128 "vector.recur.extract"); 4129 } 4130 // Extract the second last element in the middle block if the 4131 // Phi is used outside the loop. We need to extract the phi itself 4132 // and not the last element (the phi update in the current iteration). This 4133 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4134 // when the scalar loop is not run at all. 4135 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4136 if (VF.isVector()) 4137 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4138 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), 4139 "vector.recur.extract.for.phi"); 4140 // When the loop is unrolled without vectorizing, initialize 4141 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value 4142 // of `Incoming`. This is analogous to the vectorized case above: extracting 4143 // the second last element when VF > 1. 4144 else if (UF > 1) 4145 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 4146 4147 // Fix the initial value of the original recurrence in the scalar loop. 4148 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4149 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4150 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4151 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4152 Start->addIncoming(Incoming, BB); 4153 } 4154 4155 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4156 Phi->setName("scalar.recur"); 4157 4158 // Finally, fix users of the recurrence outside the loop. The users will need 4159 // either the last value of the scalar recurrence or the last value of the 4160 // vector recurrence we extracted in the middle block. Since the loop is in 4161 // LCSSA form, we just need to find all the phi nodes for the original scalar 4162 // recurrence in the exit block, and then add an edge for the middle block. 4163 // Note that LCSSA does not imply single entry when the original scalar loop 4164 // had multiple exiting edges (as we always run the last iteration in the 4165 // scalar epilogue); in that case, the exiting path through middle will be 4166 // dynamically dead and the value picked for the phi doesn't matter. 4167 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4168 if (any_of(LCSSAPhi.incoming_values(), 4169 [Phi](Value *V) { return V == Phi; })) 4170 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4171 } 4172 4173 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 4174 // Get its reduction variable descriptor.
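// As a shorthand illustration of what this fix-up produces (not the exact IR
// emitted below), consider a simple sum reduction with VF = 4, UF = 1:
//
//   int s = init;
//   for (int i = 0; i < n; ++i)
//     s += a[i];
//
// The vector loop carries a <4 x i32> phi of partial sums seeded with
// <init, 0, 0, 0> and adds a[i..i+3] to it on every iteration. The middle
// block then reduces the four lanes to a single scalar, which feeds both the
// LCSSA phi in the exit block and the merge phi used to resume the scalar
// remainder loop.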
4175 assert(Legal->isReductionVariable(Phi) && 4176 "Unable to find the reduction variable"); 4177 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4178 4179 RecurKind RK = RdxDesc.getRecurrenceKind(); 4180 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4181 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4182 setDebugLocFromInst(Builder, ReductionStartValue); 4183 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 4184 4185 // This is the vector-clone of the value that leaves the loop. 4186 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 4187 4188 // Wrap flags are in general invalid after vectorization, clear them. 4189 clearReductionWrapFlags(RdxDesc); 4190 4191 // Fix the vector-loop phi. 4192 4193 // Reductions do not have to start at zero. They can start with 4194 // any loop invariant values. 4195 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4196 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4197 4198 for (unsigned Part = 0; Part < UF; ++Part) { 4199 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 4200 Value *Val = getOrCreateVectorValue(LoopVal, Part); 4201 cast<PHINode>(VecRdxPhi) 4202 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4203 } 4204 4205 // Before each round, move the insertion point right between 4206 // the PHIs and the values we are going to write. 4207 // This allows us to write both PHINodes and the extractelement 4208 // instructions. 4209 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4210 4211 setDebugLocFromInst(Builder, LoopExitInst); 4212 4213 // If tail is folded by masking, the vector value to leave the loop should be 4214 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4215 // instead of the former. For an inloop reduction the reduction will already 4216 // be predicated, and does not need to be handled here. 4217 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4218 for (unsigned Part = 0; Part < UF; ++Part) { 4219 Value *VecLoopExitInst = 4220 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4221 Value *Sel = nullptr; 4222 for (User *U : VecLoopExitInst->users()) { 4223 if (isa<SelectInst>(U)) { 4224 assert(!Sel && "Reduction exit feeding two selects"); 4225 Sel = U; 4226 } else 4227 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4228 } 4229 assert(Sel && "Reduction exit feeds no select"); 4230 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4231 4232 // If the target can create a predicated operator for the reduction at no 4233 // extra cost in the loop (for example a predicated vadd), it can be 4234 // cheaper for the select to remain in the loop than be sunk out of it, 4235 // and so use the select value for the phi instead of the old 4236 // LoopExitValue. 4237 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4238 if (PreferPredicatedReductionSelect || 4239 TTI->preferPredicatedReductionSelect( 4240 RdxDesc.getOpcode(), Phi->getType(), 4241 TargetTransformInfo::ReductionFlags())) { 4242 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4243 VecRdxPhi->setIncomingValueForBlock( 4244 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4245 } 4246 } 4247 } 4248 4249 // If the vector reduction can be performed in a smaller type, we truncate 4250 // then extend the loop exit value to enable InstCombine to evaluate the 4251 // entire expression in the smaller type. 
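// Illustrative example (assumed types): suppose the phi is i32 but the
// descriptor determined that the recurrence fits in i8. Each unrolled part of
// the loop-exit value is truncated to <VF x i8> and immediately sign- or
// zero-extended back to <VF x i32>; its users are switched over to the
// extended value, and the final truncs feed the horizontal reduction, giving
// InstCombine the hint it needs to shrink the whole chain to i8 arithmetic.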
4252 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4253 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4254 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4255 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4256 Builder.SetInsertPoint( 4257 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4258 VectorParts RdxParts(UF); 4259 for (unsigned Part = 0; Part < UF; ++Part) { 4260 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4261 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4262 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4263 : Builder.CreateZExt(Trunc, VecTy); 4264 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4265 UI != RdxParts[Part]->user_end();) 4266 if (*UI != Trunc) { 4267 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4268 RdxParts[Part] = Extnd; 4269 } else { 4270 ++UI; 4271 } 4272 } 4273 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4274 for (unsigned Part = 0; Part < UF; ++Part) { 4275 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4276 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4277 } 4278 } 4279 4280 // Reduce all of the unrolled parts into a single vector. 4281 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4282 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4283 4284 // The middle block terminator has already been assigned a DebugLoc here (the 4285 // OrigLoop's single latch terminator). We want the whole middle block to 4286 // appear to execute on this line because: (a) it is all compiler generated, 4287 // (b) these instructions are always executed after evaluating the latch 4288 // conditional branch, and (c) other passes may add new predecessors which 4289 // terminate on this line. This is the easiest way to ensure we don't 4290 // accidentally cause an extra step back into the loop while debugging. 4291 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4292 for (unsigned Part = 1; Part < UF; ++Part) { 4293 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4294 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4295 // Floating point operations had to be 'fast' to enable the reduction. 4296 ReducedPartRdx = addFastMathFlag( 4297 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4298 ReducedPartRdx, "bin.rdx"), 4299 RdxDesc.getFastMathFlags()); 4300 else 4301 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4302 } 4303 4304 // Create the reduction after the loop. Note that inloop reductions create the 4305 // target reduction in the loop using a Reduction recipe. 4306 if (VF.isVector() && !IsInLoopReductionPhi) { 4307 ReducedPartRdx = 4308 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4309 // If the reduction can be performed in a smaller type, we need to extend 4310 // the reduction to the wider type before we branch to the original loop. 4311 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4312 ReducedPartRdx = 4313 RdxDesc.isSigned() 4314 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4315 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4316 } 4317 4318 // Create a phi node that merges control-flow from the backedge-taken check 4319 // block and the middle block. 
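// In shorthand form (illustrative block and value names), the merge phi
// created below looks like:
//
//   scalar.ph:
//     bc.merge.rdx = phi [ start, bypass ], [ rdx.result, middle.block ]
//
// so the scalar remainder loop resumes the reduction either from the original
// start value (when the vector loop was bypassed) or from the already-reduced
// vector result.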
4320 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4321 LoopScalarPreHeader->getTerminator()); 4322 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4323 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4324 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4325 4326 // Now, we need to fix the users of the reduction variable 4327 // inside and outside of the scalar remainder loop. 4328 4329 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4330 // in the exit blocks. See comment on analogous loop in 4331 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4332 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4333 if (any_of(LCSSAPhi.incoming_values(), 4334 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4335 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4336 4337 // Fix the scalar loop reduction variable with the incoming reduction sum 4338 // from the vector body and from the backedge value. 4339 int IncomingEdgeBlockIdx = 4340 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4341 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4342 // Pick the other block. 4343 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4344 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4345 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4346 } 4347 4348 void InnerLoopVectorizer::clearReductionWrapFlags( 4349 RecurrenceDescriptor &RdxDesc) { 4350 RecurKind RK = RdxDesc.getRecurrenceKind(); 4351 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4352 return; 4353 4354 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4355 assert(LoopExitInstr && "null loop exit instruction"); 4356 SmallVector<Instruction *, 8> Worklist; 4357 SmallPtrSet<Instruction *, 8> Visited; 4358 Worklist.push_back(LoopExitInstr); 4359 Visited.insert(LoopExitInstr); 4360 4361 while (!Worklist.empty()) { 4362 Instruction *Cur = Worklist.pop_back_val(); 4363 if (isa<OverflowingBinaryOperator>(Cur)) 4364 for (unsigned Part = 0; Part < UF; ++Part) { 4365 Value *V = getOrCreateVectorValue(Cur, Part); 4366 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4367 } 4368 4369 for (User *U : Cur->users()) { 4370 Instruction *UI = cast<Instruction>(U); 4371 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4372 Visited.insert(UI).second) 4373 Worklist.push_back(UI); 4374 } 4375 } 4376 } 4377 4378 void InnerLoopVectorizer::fixLCSSAPHIs() { 4379 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4380 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4381 // Some phis were already hand updated by the reduction and recurrence 4382 // code above, leave them alone. 4383 continue; 4384 4385 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4386 // Non-instruction incoming values will have only one value. 4387 unsigned LastLane = 0; 4388 if (isa<Instruction>(IncomingValue)) 4389 LastLane = Cost->isUniformAfterVectorization( 4390 cast<Instruction>(IncomingValue), VF) 4391 ? 0 4392 : VF.getKnownMinValue() - 1; 4393 assert((!VF.isScalable() || LastLane == 0) && 4394 "scalable vectors dont support non-uniform scalars yet"); 4395 // Can be a loop invariant incoming value or the last scalar value to be 4396 // extracted from the vectorized loop. 
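// For example (illustrative names): an exit-block phi that was
//
//   %t.lcssa = phi [ %t, %loop.latch ]
//
// in the scalar loop receives a second incoming value
//
//   %t.lcssa = phi [ %t, %loop.latch ], [ %t.extract, %middle.block ]
//
// where %t.extract is lane VF - 1 of the last unrolled part, or lane 0 if %t
// is uniform after vectorization.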
4397 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4398 Value *lastIncomingValue = 4399 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4400 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4401 } 4402 } 4403 4404 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4405 // The basic block and loop containing the predicated instruction. 4406 auto *PredBB = PredInst->getParent(); 4407 auto *VectorLoop = LI->getLoopFor(PredBB); 4408 4409 // Initialize a worklist with the operands of the predicated instruction. 4410 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4411 4412 // Holds instructions that we need to analyze again. An instruction may be 4413 // reanalyzed if we don't yet know if we can sink it or not. 4414 SmallVector<Instruction *, 8> InstsToReanalyze; 4415 4416 // Returns true if a given use occurs in the predicated block. Phi nodes use 4417 // their operands in their corresponding predecessor blocks. 4418 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4419 auto *I = cast<Instruction>(U.getUser()); 4420 BasicBlock *BB = I->getParent(); 4421 if (auto *Phi = dyn_cast<PHINode>(I)) 4422 BB = Phi->getIncomingBlock( 4423 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4424 return BB == PredBB; 4425 }; 4426 4427 // Iteratively sink the scalarized operands of the predicated instruction 4428 // into the block we created for it. When an instruction is sunk, it's 4429 // operands are then added to the worklist. The algorithm ends after one pass 4430 // through the worklist doesn't sink a single instruction. 4431 bool Changed; 4432 do { 4433 // Add the instructions that need to be reanalyzed to the worklist, and 4434 // reset the changed indicator. 4435 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4436 InstsToReanalyze.clear(); 4437 Changed = false; 4438 4439 while (!Worklist.empty()) { 4440 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4441 4442 // We can't sink an instruction if it is a phi node, is already in the 4443 // predicated block, is not in the loop, or may have side effects. 4444 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4445 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4446 continue; 4447 4448 // It's legal to sink the instruction if all its uses occur in the 4449 // predicated block. Otherwise, there's nothing to do yet, and we may 4450 // need to reanalyze the instruction. 4451 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4452 InstsToReanalyze.push_back(I); 4453 continue; 4454 } 4455 4456 // Move the instruction to the beginning of the predicated block, and add 4457 // it's operands to the worklist. 4458 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4459 Worklist.insert(I->op_begin(), I->op_end()); 4460 4461 // The sinking may have enabled other instructions to be sunk, so we will 4462 // need to iterate. 
4463 Changed = true; 4464 } 4465 } while (Changed); 4466 } 4467 4468 void InnerLoopVectorizer::fixNonInductionPHIs() { 4469 for (PHINode *OrigPhi : OrigPHIsToFix) { 4470 PHINode *NewPhi = 4471 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4472 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4473 4474 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4475 predecessors(OrigPhi->getParent())); 4476 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4477 predecessors(NewPhi->getParent())); 4478 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4479 "Scalar and Vector BB should have the same number of predecessors"); 4480 4481 // The insertion point in Builder may be invalidated by the time we get 4482 // here. Force the Builder insertion point to something valid so that we do 4483 // not run into issues during insertion point restore in 4484 // getOrCreateVectorValue calls below. 4485 Builder.SetInsertPoint(NewPhi); 4486 4487 // The predecessor order is preserved and we can rely on mapping between 4488 // scalar and vector block predecessors. 4489 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4490 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4491 4492 // When looking up the new scalar/vector values to fix up, use incoming 4493 // values from original phi. 4494 Value *ScIncV = 4495 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4496 4497 // Scalar incoming value may need a broadcast 4498 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4499 NewPhi->addIncoming(NewIncV, NewPredBB); 4500 } 4501 } 4502 } 4503 4504 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4505 VPUser &Operands, unsigned UF, 4506 ElementCount VF, bool IsPtrLoopInvariant, 4507 SmallBitVector &IsIndexLoopInvariant, 4508 VPTransformState &State) { 4509 // Construct a vector GEP by widening the operands of the scalar GEP as 4510 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4511 // results in a vector of pointers when at least one operand of the GEP 4512 // is vector-typed. Thus, to keep the representation compact, we only use 4513 // vector-typed operands for loop-varying values. 4514 4515 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4516 // If we are vectorizing, but the GEP has only loop-invariant operands, 4517 // the GEP we build (by only using vector-typed operands for 4518 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4519 // produce a vector of pointers, we need to either arbitrarily pick an 4520 // operand to broadcast, or broadcast a clone of the original GEP. 4521 // Here, we broadcast a clone of the original. 4522 // 4523 // TODO: If at some point we decide to scalarize instructions having 4524 // loop-invariant operands, this special case will no longer be 4525 // required. We would add the scalarization decision to 4526 // collectLoopScalars() and teach getVectorValue() to broadcast 4527 // the lane-zero scalar value. 4528 auto *Clone = Builder.Insert(GEP->clone()); 4529 for (unsigned Part = 0; Part < UF; ++Part) { 4530 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4531 State.set(VPDef, GEP, EntryPart, Part); 4532 addMetadata(EntryPart, GEP); 4533 } 4534 } else { 4535 // If the GEP has at least one loop-varying operand, we are sure to 4536 // produce a vector of pointers. But if we are only unrolling, we want 4537 // to produce a scalar GEP for each unroll part. 
Thus, the GEP we 4538 // produce with the code below will be scalar (if VF == 1) or vector 4539 // (otherwise). Note that for the unroll-only case, we still maintain 4540 // values in the vector mapping with initVector, as we do for other 4541 // instructions. 4542 for (unsigned Part = 0; Part < UF; ++Part) { 4543 // The pointer operand of the new GEP. If it's loop-invariant, we 4544 // won't broadcast it. 4545 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4546 : State.get(Operands.getOperand(0), Part); 4547 4548 // Collect all the indices for the new GEP. If any index is 4549 // loop-invariant, we won't broadcast it. 4550 SmallVector<Value *, 4> Indices; 4551 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4552 VPValue *Operand = Operands.getOperand(I); 4553 if (IsIndexLoopInvariant[I - 1]) 4554 Indices.push_back(State.get(Operand, {0, 0})); 4555 else 4556 Indices.push_back(State.get(Operand, Part)); 4557 } 4558 4559 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4560 // but it should be a vector, otherwise. 4561 auto *NewGEP = 4562 GEP->isInBounds() 4563 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4564 Indices) 4565 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4566 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4567 "NewGEP is not a pointer vector"); 4568 State.set(VPDef, GEP, NewGEP, Part); 4569 addMetadata(NewGEP, GEP); 4570 } 4571 } 4572 } 4573 4574 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4575 RecurrenceDescriptor *RdxDesc, 4576 Value *StartV, unsigned UF, 4577 ElementCount VF) { 4578 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4579 PHINode *P = cast<PHINode>(PN); 4580 if (EnableVPlanNativePath) { 4581 // Currently we enter here in the VPlan-native path for non-induction 4582 // PHIs where all control flow is uniform. We simply widen these PHIs. 4583 // Create a vector phi with no operands - the vector phi operands will be 4584 // set at the end of vector code generation. 4585 Type *VecTy = 4586 (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF); 4587 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4588 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4589 OrigPHIsToFix.push_back(P); 4590 4591 return; 4592 } 4593 4594 assert(PN->getParent() == OrigLoop->getHeader() && 4595 "Non-header phis should have been handled elsewhere"); 4596 4597 // In order to support recurrences we need to be able to vectorize Phi nodes. 4598 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4599 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4600 // this value when we vectorize all of the instructions that use the PHI. 4601 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { 4602 Value *Iden = nullptr; 4603 bool ScalarPHI = 4604 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4605 Type *VecTy = 4606 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4607 4608 if (RdxDesc) { 4609 assert(Legal->isReductionVariable(P) && StartV && 4610 "RdxDesc should only be set for reduction variables; in that case " 4611 "a StartV is also required"); 4612 RecurKind RK = RdxDesc->getRecurrenceKind(); 4613 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { 4614 // MinMax reductions have the start value as their identity.
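// For example (illustrative values): for an integer 'add' reduction with
// start value 5 and VF = 4, the identity is 0 and the vector phi is seeded
// with <5, 0, 0, 0>; for an 'smax' reduction the start value is its own
// identity, so the full splat <5, 5, 5, 5> is used instead.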
4615 if (ScalarPHI) { 4616 Iden = StartV; 4617 } else { 4618 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4619 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4620 StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident"); 4621 } 4622 } else { 4623 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( 4624 RK, VecTy->getScalarType()); 4625 Iden = IdenC; 4626 4627 if (!ScalarPHI) { 4628 Iden = ConstantVector::getSplat(VF, IdenC); 4629 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4630 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4631 Constant *Zero = Builder.getInt32(0); 4632 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 4633 } 4634 } 4635 } 4636 4637 for (unsigned Part = 0; Part < UF; ++Part) { 4638 // This is phase one of vectorizing PHIs. 4639 Value *EntryPart = PHINode::Create( 4640 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4641 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4642 if (StartV) { 4643 // Make sure to add the reduction start value only to the 4644 // first unroll part. 4645 Value *StartVal = (Part == 0) ? StartV : Iden; 4646 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); 4647 } 4648 } 4649 return; 4650 } 4651 4652 assert(!Legal->isReductionVariable(P) && 4653 "reductions should be handled above"); 4654 4655 setDebugLocFromInst(Builder, P); 4656 4657 // This PHINode must be an induction variable. 4658 // Make sure that we know about it. 4659 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4660 4661 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4662 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4663 4664 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4665 // which can be found from the original scalar operations. 4666 switch (II.getKind()) { 4667 case InductionDescriptor::IK_NoInduction: 4668 llvm_unreachable("Unknown induction"); 4669 case InductionDescriptor::IK_IntInduction: 4670 case InductionDescriptor::IK_FpInduction: 4671 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4672 case InductionDescriptor::IK_PtrInduction: { 4673 // Handle the pointer induction variable case. 4674 assert(P->getType()->isPointerTy() && "Unexpected type."); 4675 4676 if (Cost->isScalarAfterVectorization(P, VF)) { 4677 // This is the normalized GEP that starts counting at zero. 4678 Value *PtrInd = 4679 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4680 // Determine the number of scalars we need to generate for each unroll 4681 // iteration. If the instruction is uniform, we only need to generate the 4682 // first lane. Otherwise, we generate all VF values. 4683 unsigned Lanes = 4684 Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.getKnownMinValue(); 4685 for (unsigned Part = 0; Part < UF; ++Part) { 4686 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4687 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4688 Lane + Part * VF.getKnownMinValue()); 4689 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4690 Value *SclrGep = 4691 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4692 SclrGep->setName("next.gep"); 4693 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4694 } 4695 } 4696 return; 4697 } 4698 assert(isa<SCEVConstant>(II.getStep()) && 4699 "Induction step not a SCEV constant!"); 4700 Type *PhiType = II.getStep()->getType(); 4701 4702 // Build a pointer phi 4703 Value *ScalarStartValue = II.getStartValue(); 4704 Type *ScStValueType = ScalarStartValue->getType(); 4705 PHINode *NewPointerPhi = 4706 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4707 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4708 4709 // A pointer induction, performed by using a gep 4710 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4711 Instruction *InductionLoc = LoopLatch->getTerminator(); 4712 const SCEV *ScalarStep = II.getStep(); 4713 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4714 Value *ScalarStepValue = 4715 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4716 Value *InductionGEP = GetElementPtrInst::Create( 4717 ScStValueType->getPointerElementType(), NewPointerPhi, 4718 Builder.CreateMul( 4719 ScalarStepValue, 4720 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4721 "ptr.ind", InductionLoc); 4722 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4723 4724 // Create UF many actual address geps that use the pointer 4725 // phi as base and a vectorized version of the step value 4726 // (<step*0, ..., step*N>) as offset. 4727 for (unsigned Part = 0; Part < UF; ++Part) { 4728 SmallVector<Constant *, 8> Indices; 4729 // Create a vector of consecutive numbers from zero to VF. 4730 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4731 Indices.push_back( 4732 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4733 Constant *StartOffset = ConstantVector::get(Indices); 4734 4735 Value *GEP = Builder.CreateGEP( 4736 ScStValueType->getPointerElementType(), NewPointerPhi, 4737 Builder.CreateMul( 4738 StartOffset, 4739 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4740 "vector.gep")); 4741 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4742 } 4743 } 4744 } 4745 } 4746 4747 /// A helper function for checking whether an integer division-related 4748 /// instruction may divide by zero (in which case it must be predicated if 4749 /// executed conditionally in the scalar code). 4750 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4751 /// Non-zero divisors that are non compile-time constants will not be 4752 /// converted into multiplication, so we will still end up scalarizing 4753 /// the division, but can do so w/o predication. 
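/// For example (illustrative): in
///
///   if (c[i]) sum += a[i] / b[i];
///
/// the divisor b[i] is loop-varying, so the scalarized division must remain
/// under its predicate, whereas a division by the constant 7 is known not to
/// trap and may be executed unconditionally.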
4754 static bool mayDivideByZero(Instruction &I) { 4755 assert((I.getOpcode() == Instruction::UDiv || 4756 I.getOpcode() == Instruction::SDiv || 4757 I.getOpcode() == Instruction::URem || 4758 I.getOpcode() == Instruction::SRem) && 4759 "Unexpected instruction"); 4760 Value *Divisor = I.getOperand(1); 4761 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4762 return !CInt || CInt->isZero(); 4763 } 4764 4765 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4766 VPUser &User, 4767 VPTransformState &State) { 4768 switch (I.getOpcode()) { 4769 case Instruction::Call: 4770 case Instruction::Br: 4771 case Instruction::PHI: 4772 case Instruction::GetElementPtr: 4773 case Instruction::Select: 4774 llvm_unreachable("This instruction is handled by a different recipe."); 4775 case Instruction::UDiv: 4776 case Instruction::SDiv: 4777 case Instruction::SRem: 4778 case Instruction::URem: 4779 case Instruction::Add: 4780 case Instruction::FAdd: 4781 case Instruction::Sub: 4782 case Instruction::FSub: 4783 case Instruction::FNeg: 4784 case Instruction::Mul: 4785 case Instruction::FMul: 4786 case Instruction::FDiv: 4787 case Instruction::FRem: 4788 case Instruction::Shl: 4789 case Instruction::LShr: 4790 case Instruction::AShr: 4791 case Instruction::And: 4792 case Instruction::Or: 4793 case Instruction::Xor: { 4794 // Just widen unops and binops. 4795 setDebugLocFromInst(Builder, &I); 4796 4797 for (unsigned Part = 0; Part < UF; ++Part) { 4798 SmallVector<Value *, 2> Ops; 4799 for (VPValue *VPOp : User.operands()) 4800 Ops.push_back(State.get(VPOp, Part)); 4801 4802 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4803 4804 if (auto *VecOp = dyn_cast<Instruction>(V)) 4805 VecOp->copyIRFlags(&I); 4806 4807 // Use this vector value for all users of the original instruction. 4808 State.set(Def, &I, V, Part); 4809 addMetadata(V, &I); 4810 } 4811 4812 break; 4813 } 4814 case Instruction::ICmp: 4815 case Instruction::FCmp: { 4816 // Widen compares. Generate vector compares. 4817 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4818 auto *Cmp = cast<CmpInst>(&I); 4819 setDebugLocFromInst(Builder, Cmp); 4820 for (unsigned Part = 0; Part < UF; ++Part) { 4821 Value *A = State.get(User.getOperand(0), Part); 4822 Value *B = State.get(User.getOperand(1), Part); 4823 Value *C = nullptr; 4824 if (FCmp) { 4825 // Propagate fast math flags. 4826 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4827 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4828 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4829 } else { 4830 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4831 } 4832 State.set(Def, &I, C, Part); 4833 addMetadata(C, &I); 4834 } 4835 4836 break; 4837 } 4838 4839 case Instruction::ZExt: 4840 case Instruction::SExt: 4841 case Instruction::FPToUI: 4842 case Instruction::FPToSI: 4843 case Instruction::FPExt: 4844 case Instruction::PtrToInt: 4845 case Instruction::IntToPtr: 4846 case Instruction::SIToFP: 4847 case Instruction::UIToFP: 4848 case Instruction::Trunc: 4849 case Instruction::FPTrunc: 4850 case Instruction::BitCast: { 4851 auto *CI = cast<CastInst>(&I); 4852 setDebugLocFromInst(Builder, CI); 4853 4854 /// Vectorize casts. 4855 Type *DestTy = 4856 (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4857 4858 for (unsigned Part = 0; Part < UF; ++Part) { 4859 Value *A = State.get(User.getOperand(0), Part); 4860 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4861 State.set(Def, &I, Cast, Part); 4862 addMetadata(Cast, &I); 4863 } 4864 break; 4865 } 4866 default: 4867 // This instruction is not vectorized by simple widening. 4868 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4869 llvm_unreachable("Unhandled instruction!"); 4870 } // end of switch. 4871 } 4872 4873 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4874 VPUser &ArgOperands, 4875 VPTransformState &State) { 4876 assert(!isa<DbgInfoIntrinsic>(I) && 4877 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4878 setDebugLocFromInst(Builder, &I); 4879 4880 Module *M = I.getParent()->getParent()->getParent(); 4881 auto *CI = cast<CallInst>(&I); 4882 4883 SmallVector<Type *, 4> Tys; 4884 for (Value *ArgOperand : CI->arg_operands()) 4885 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4886 4887 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4888 4889 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4890 // version of the instruction. 4891 // Is it beneficial to perform intrinsic call compared to lib call? 4892 bool NeedToScalarize = false; 4893 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4894 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4895 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4896 assert((UseVectorIntrinsic || !NeedToScalarize) && 4897 "Instruction should be scalarized elsewhere."); 4898 assert(IntrinsicCost.isValid() && CallCost.isValid() && 4899 "Cannot have invalid costs while widening"); 4900 4901 for (unsigned Part = 0; Part < UF; ++Part) { 4902 SmallVector<Value *, 4> Args; 4903 for (auto &I : enumerate(ArgOperands.operands())) { 4904 // Some intrinsics have a scalar argument - don't replace it with a 4905 // vector. 4906 Value *Arg; 4907 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4908 Arg = State.get(I.value(), Part); 4909 else 4910 Arg = State.get(I.value(), {0, 0}); 4911 Args.push_back(Arg); 4912 } 4913 4914 Function *VectorF; 4915 if (UseVectorIntrinsic) { 4916 // Use vector version of the intrinsic. 4917 Type *TysForDecl[] = {CI->getType()}; 4918 if (VF.isVector()) { 4919 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4920 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4921 } 4922 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4923 assert(VectorF && "Can't retrieve vector intrinsic."); 4924 } else { 4925 // Use vector version of the function call. 
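// For example (illustrative; the concrete names depend on the available TLI
// mappings): a scalar call to expf inside the loop may have a 4-lane vector
// variant registered by InjectTLIMappings, in which case VFDatabase returns
// that library routine and the call below is emitted against it with widened
// arguments.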
4926 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4927 #ifndef NDEBUG 4928 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4929 "Can't create vector function."); 4930 #endif 4931 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4932 } 4933 SmallVector<OperandBundleDef, 1> OpBundles; 4934 CI->getOperandBundlesAsDefs(OpBundles); 4935 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4936 4937 if (isa<FPMathOperator>(V)) 4938 V->copyFastMathFlags(CI); 4939 4940 State.set(Def, &I, V, Part); 4941 addMetadata(V, &I); 4942 } 4943 } 4944 4945 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4946 VPUser &Operands, 4947 bool InvariantCond, 4948 VPTransformState &State) { 4949 setDebugLocFromInst(Builder, &I); 4950 4951 // The condition can be loop invariant but still defined inside the 4952 // loop. This means that we can't just use the original 'cond' value. 4953 // We have to take the 'vectorized' value and pick the first lane. 4954 // Instcombine will make this a no-op. 4955 auto *InvarCond = 4956 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4957 4958 for (unsigned Part = 0; Part < UF; ++Part) { 4959 Value *Cond = 4960 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4961 Value *Op0 = State.get(Operands.getOperand(1), Part); 4962 Value *Op1 = State.get(Operands.getOperand(2), Part); 4963 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4964 State.set(VPDef, &I, Sel, Part); 4965 addMetadata(Sel, &I); 4966 } 4967 } 4968 4969 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4970 // We should not collect Scalars more than once per VF. Right now, this 4971 // function is called from collectUniformsAndScalars(), which already does 4972 // this check. Collecting Scalars for VF=1 does not make any sense. 4973 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4974 "This function should not be visited twice for the same VF"); 4975 4976 SmallSetVector<Instruction *, 8> Worklist; 4977 4978 // These sets are used to seed the analysis with pointers used by memory 4979 // accesses that will remain scalar. 4980 SmallSetVector<Instruction *, 8> ScalarPtrs; 4981 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4982 auto *Latch = TheLoop->getLoopLatch(); 4983 4984 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4985 // The pointer operands of loads and stores will be scalar as long as the 4986 // memory access is not a gather or scatter operation. The value operand of a 4987 // store will remain scalar if the store is scalarized. 4988 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4989 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4990 assert(WideningDecision != CM_Unknown && 4991 "Widening decision should be ready at this moment"); 4992 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4993 if (Ptr == Store->getValueOperand()) 4994 return WideningDecision == CM_Scalarize; 4995 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4996 "Ptr is neither a value or pointer operand"); 4997 return WideningDecision != CM_GatherScatter; 4998 }; 4999 5000 // A helper that returns true if the given value is a bitcast or 5001 // getelementptr instruction contained in the loop. 
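// For example (illustrative): the getelementptr computing &a[i] is
// loop-varying and is of interest here, while a getelementptr or bitcast that
// only uses values defined outside the loop is loop-invariant and is skipped.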
5002 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5003 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5004 isa<GetElementPtrInst>(V)) && 5005 !TheLoop->isLoopInvariant(V); 5006 }; 5007 5008 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5009 if (!isa<PHINode>(Ptr) || 5010 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5011 return false; 5012 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5013 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5014 return false; 5015 return isScalarUse(MemAccess, Ptr); 5016 }; 5017 5018 // A helper that evaluates a memory access's use of a pointer. If the 5019 // pointer is actually the pointer induction of a loop, it is being 5020 // inserted into Worklist. If the use will be a scalar use, and the 5021 // pointer is only used by memory accesses, we place the pointer in 5022 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5023 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5024 if (isScalarPtrInduction(MemAccess, Ptr)) { 5025 Worklist.insert(cast<Instruction>(Ptr)); 5026 Instruction *Update = cast<Instruction>( 5027 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5028 Worklist.insert(Update); 5029 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5030 << "\n"); 5031 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5032 << "\n"); 5033 return; 5034 } 5035 // We only care about bitcast and getelementptr instructions contained in 5036 // the loop. 5037 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5038 return; 5039 5040 // If the pointer has already been identified as scalar (e.g., if it was 5041 // also identified as uniform), there's nothing to do. 5042 auto *I = cast<Instruction>(Ptr); 5043 if (Worklist.count(I)) 5044 return; 5045 5046 // If the use of the pointer will be a scalar use, and all users of the 5047 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5048 // place the pointer in PossibleNonScalarPtrs. 5049 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5050 return isa<LoadInst>(U) || isa<StoreInst>(U); 5051 })) 5052 ScalarPtrs.insert(I); 5053 else 5054 PossibleNonScalarPtrs.insert(I); 5055 }; 5056 5057 // We seed the scalars analysis with three classes of instructions: (1) 5058 // instructions marked uniform-after-vectorization and (2) bitcast, 5059 // getelementptr and (pointer) phi instructions used by memory accesses 5060 // requiring a scalar use. 5061 // 5062 // (1) Add to the worklist all instructions that have been identified as 5063 // uniform-after-vectorization. 5064 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5065 5066 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5067 // memory accesses requiring a scalar use. The pointer operands of loads and 5068 // stores will be scalar as long as the memory accesses is not a gather or 5069 // scatter operation. The value operand of a store will remain scalar if the 5070 // store is scalarized. 
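// As an illustrative example: in
//
//   for (i) if (c[i]) b[i] = x;
//
// a store the cost model decides to scalarize keeps one scalar address per
// lane, so the getelementptr computing &b[i] is seeded here as a scalar
// pointer; had the store been widened into a scatter, the same GEP would need
// to produce a vector of pointers and would land in PossibleNonScalarPtrs.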
5071 for (auto *BB : TheLoop->blocks()) 5072 for (auto &I : *BB) { 5073 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5074 evaluatePtrUse(Load, Load->getPointerOperand()); 5075 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5076 evaluatePtrUse(Store, Store->getPointerOperand()); 5077 evaluatePtrUse(Store, Store->getValueOperand()); 5078 } 5079 } 5080 for (auto *I : ScalarPtrs) 5081 if (!PossibleNonScalarPtrs.count(I)) { 5082 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5083 Worklist.insert(I); 5084 } 5085 5086 // Insert the forced scalars. 5087 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5088 // induction variable when the PHI user is scalarized. 5089 auto ForcedScalar = ForcedScalars.find(VF); 5090 if (ForcedScalar != ForcedScalars.end()) 5091 for (auto *I : ForcedScalar->second) 5092 Worklist.insert(I); 5093 5094 // Expand the worklist by looking through any bitcasts and getelementptr 5095 // instructions we've already identified as scalar. This is similar to the 5096 // expansion step in collectLoopUniforms(); however, here we're only 5097 // expanding to include additional bitcasts and getelementptr instructions. 5098 unsigned Idx = 0; 5099 while (Idx != Worklist.size()) { 5100 Instruction *Dst = Worklist[Idx++]; 5101 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5102 continue; 5103 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5104 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5105 auto *J = cast<Instruction>(U); 5106 return !TheLoop->contains(J) || Worklist.count(J) || 5107 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5108 isScalarUse(J, Src)); 5109 })) { 5110 Worklist.insert(Src); 5111 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5112 } 5113 } 5114 5115 // An induction variable will remain scalar if all users of the induction 5116 // variable and induction variable update remain scalar. 5117 for (auto &Induction : Legal->getInductionVars()) { 5118 auto *Ind = Induction.first; 5119 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5120 5121 // If tail-folding is applied, the primary induction variable will be used 5122 // to feed a vector compare. 5123 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5124 continue; 5125 5126 // Determine if all users of the induction variable are scalar after 5127 // vectorization. 5128 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5129 auto *I = cast<Instruction>(U); 5130 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5131 }); 5132 if (!ScalarInd) 5133 continue; 5134 5135 // Determine if all users of the induction variable update instruction are 5136 // scalar after vectorization. 5137 auto ScalarIndUpdate = 5138 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5139 auto *I = cast<Instruction>(U); 5140 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5141 }); 5142 if (!ScalarIndUpdate) 5143 continue; 5144 5145 // The induction variable and its update instruction will remain scalar. 
5146 Worklist.insert(Ind); 5147 Worklist.insert(IndUpdate); 5148 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5149 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5150 << "\n"); 5151 } 5152 5153 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5154 } 5155 5156 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5157 ElementCount VF) { 5158 if (!blockNeedsPredication(I->getParent())) 5159 return false; 5160 switch(I->getOpcode()) { 5161 default: 5162 break; 5163 case Instruction::Load: 5164 case Instruction::Store: { 5165 if (!Legal->isMaskRequired(I)) 5166 return false; 5167 auto *Ptr = getLoadStorePointerOperand(I); 5168 auto *Ty = getMemInstValueType(I); 5169 // We have already decided how to vectorize this instruction, get that 5170 // result. 5171 if (VF.isVector()) { 5172 InstWidening WideningDecision = getWideningDecision(I, VF); 5173 assert(WideningDecision != CM_Unknown && 5174 "Widening decision should be ready at this moment"); 5175 return WideningDecision == CM_Scalarize; 5176 } 5177 const Align Alignment = getLoadStoreAlignment(I); 5178 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5179 isLegalMaskedGather(Ty, Alignment)) 5180 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5181 isLegalMaskedScatter(Ty, Alignment)); 5182 } 5183 case Instruction::UDiv: 5184 case Instruction::SDiv: 5185 case Instruction::SRem: 5186 case Instruction::URem: 5187 return mayDivideByZero(*I); 5188 } 5189 return false; 5190 } 5191 5192 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5193 Instruction *I, ElementCount VF) { 5194 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5195 assert(getWideningDecision(I, VF) == CM_Unknown && 5196 "Decision should not be set yet."); 5197 auto *Group = getInterleavedAccessGroup(I); 5198 assert(Group && "Must have a group."); 5199 5200 // If the instruction's allocated size doesn't equal it's type size, it 5201 // requires padding and will be scalarized. 5202 auto &DL = I->getModule()->getDataLayout(); 5203 auto *ScalarTy = getMemInstValueType(I); 5204 if (hasIrregularType(ScalarTy, DL, VF)) 5205 return false; 5206 5207 // Check if masking is required. 5208 // A Group may need masking for one of two reasons: it resides in a block that 5209 // needs predication, or it was decided to use masking to deal with gaps. 5210 bool PredicatedAccessRequiresMasking = 5211 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5212 bool AccessWithGapsRequiresMasking = 5213 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5214 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5215 return true; 5216 5217 // If masked interleaving is required, we expect that the user/target had 5218 // enabled it, because otherwise it either wouldn't have been created or 5219 // it should have been invalidated by the CostModel. 5220 assert(useMaskedInterleavedAccesses(TTI) && 5221 "Masked interleave-groups for predicated accesses are not enabled."); 5222 5223 auto *Ty = getMemInstValueType(I); 5224 const Align Alignment = getLoadStoreAlignment(I); 5225 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5226 : TTI.isLegalMaskedStore(Ty, Alignment); 5227 } 5228 5229 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5230 Instruction *I, ElementCount VF) { 5231 // Get and ensure we have a valid memory instruction. 
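// Illustrative examples: a unit-stride load of a[i] is consecutive and can be
// widened into a single wide load, whereas a load of a[3 * i] is not
// consecutive; the latter is rejected here and ends up scalarized or emitted
// as a gather instead.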
5232 LoadInst *LI = dyn_cast<LoadInst>(I); 5233 StoreInst *SI = dyn_cast<StoreInst>(I); 5234 assert((LI || SI) && "Invalid memory instruction"); 5235 5236 auto *Ptr = getLoadStorePointerOperand(I); 5237 5238 // In order to be widened, the pointer should be consecutive, first of all. 5239 if (!Legal->isConsecutivePtr(Ptr)) 5240 return false; 5241 5242 // If the instruction is a store located in a predicated block, it will be 5243 // scalarized. 5244 if (isScalarWithPredication(I)) 5245 return false; 5246 5247 // If the instruction's allocated size doesn't equal it's type size, it 5248 // requires padding and will be scalarized. 5249 auto &DL = I->getModule()->getDataLayout(); 5250 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5251 if (hasIrregularType(ScalarTy, DL, VF)) 5252 return false; 5253 5254 return true; 5255 } 5256 5257 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5258 // We should not collect Uniforms more than once per VF. Right now, 5259 // this function is called from collectUniformsAndScalars(), which 5260 // already does this check. Collecting Uniforms for VF=1 does not make any 5261 // sense. 5262 5263 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5264 "This function should not be visited twice for the same VF"); 5265 5266 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5267 // not analyze again. Uniforms.count(VF) will return 1. 5268 Uniforms[VF].clear(); 5269 5270 // We now know that the loop is vectorizable! 5271 // Collect instructions inside the loop that will remain uniform after 5272 // vectorization. 5273 5274 // Global values, params and instructions outside of current loop are out of 5275 // scope. 5276 auto isOutOfScope = [&](Value *V) -> bool { 5277 Instruction *I = dyn_cast<Instruction>(V); 5278 return (!I || !TheLoop->contains(I)); 5279 }; 5280 5281 SetVector<Instruction *> Worklist; 5282 BasicBlock *Latch = TheLoop->getLoopLatch(); 5283 5284 // Instructions that are scalar with predication must not be considered 5285 // uniform after vectorization, because that would create an erroneous 5286 // replicating region where only a single instance out of VF should be formed. 5287 // TODO: optimize such seldom cases if found important, see PR40816. 5288 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5289 if (isOutOfScope(I)) { 5290 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5291 << *I << "\n"); 5292 return; 5293 } 5294 if (isScalarWithPredication(I, VF)) { 5295 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5296 << *I << "\n"); 5297 return; 5298 } 5299 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5300 Worklist.insert(I); 5301 }; 5302 5303 // Start with the conditional branch. If the branch condition is an 5304 // instruction contained in the loop that is only used by the branch, it is 5305 // uniform. 5306 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5307 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5308 addToWorklistIfAllowed(Cmp); 5309 5310 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5311 InstWidening WideningDecision = getWideningDecision(I, VF); 5312 assert(WideningDecision != CM_Unknown && 5313 "Widening decision should be ready at this moment"); 5314 5315 // A uniform memory op is itself uniform. We exclude uniform stores 5316 // here as they demand the last lane, not the first one. 
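// For example (illustrative): a load from a loop-invariant address, such as
// 'x = *p' executed on every iteration, reads the same location in all lanes
// and only lane 0 is needed; a store '*p = i' to that same invariant address
// must instead keep the value of the final lane, which is why uniform stores
// are excluded here.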
5317 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5318 assert(WideningDecision == CM_Scalarize); 5319 return true; 5320 } 5321 5322 return (WideningDecision == CM_Widen || 5323 WideningDecision == CM_Widen_Reverse || 5324 WideningDecision == CM_Interleave); 5325 }; 5326 5327 5328 // Returns true if Ptr is the pointer operand of a memory access instruction 5329 // I, and I is known to not require scalarization. 5330 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5331 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5332 }; 5333 5334 // Holds a list of values which are known to have at least one uniform use. 5335 // Note that there may be other uses which aren't uniform. A "uniform use" 5336 // here is something which only demands lane 0 of the unrolled iterations; 5337 // it does not imply that all lanes produce the same value (e.g. this is not 5338 // the usual meaning of uniform) 5339 SmallPtrSet<Value *, 8> HasUniformUse; 5340 5341 // Scan the loop for instructions which are either a) known to have only 5342 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5343 for (auto *BB : TheLoop->blocks()) 5344 for (auto &I : *BB) { 5345 // If there's no pointer operand, there's nothing to do. 5346 auto *Ptr = getLoadStorePointerOperand(&I); 5347 if (!Ptr) 5348 continue; 5349 5350 // A uniform memory op is itself uniform. We exclude uniform stores 5351 // here as they demand the last lane, not the first one. 5352 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5353 addToWorklistIfAllowed(&I); 5354 5355 if (isUniformDecision(&I, VF)) { 5356 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5357 HasUniformUse.insert(Ptr); 5358 } 5359 } 5360 5361 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5362 // demanding) users. Since loops are assumed to be in LCSSA form, this 5363 // disallows uses outside the loop as well. 5364 for (auto *V : HasUniformUse) { 5365 if (isOutOfScope(V)) 5366 continue; 5367 auto *I = cast<Instruction>(V); 5368 auto UsersAreMemAccesses = 5369 llvm::all_of(I->users(), [&](User *U) -> bool { 5370 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5371 }); 5372 if (UsersAreMemAccesses) 5373 addToWorklistIfAllowed(I); 5374 } 5375 5376 // Expand Worklist in topological order: whenever a new instruction 5377 // is added , its users should be already inside Worklist. It ensures 5378 // a uniform instruction will only be used by uniform instructions. 5379 unsigned idx = 0; 5380 while (idx != Worklist.size()) { 5381 Instruction *I = Worklist[idx++]; 5382 5383 for (auto OV : I->operand_values()) { 5384 // isOutOfScope operands cannot be uniform instructions. 5385 if (isOutOfScope(OV)) 5386 continue; 5387 // First order recurrence Phi's should typically be considered 5388 // non-uniform. 5389 auto *OP = dyn_cast<PHINode>(OV); 5390 if (OP && Legal->isFirstOrderRecurrence(OP)) 5391 continue; 5392 // If all the users of the operand are uniform, then add the 5393 // operand into the uniform worklist. 5394 auto *OI = cast<Instruction>(OV); 5395 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5396 auto *J = cast<Instruction>(U); 5397 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5398 })) 5399 addToWorklistIfAllowed(OI); 5400 } 5401 } 5402 5403 // For an instruction to be added into Worklist above, all its users inside 5404 // the loop should also be in Worklist. 
However, this condition cannot be 5405 // true for phi nodes that form a cyclic dependence. We must process phi 5406 // nodes separately. An induction variable will remain uniform if all users 5407 // of the induction variable and induction variable update remain uniform. 5408 // The code below handles both pointer and non-pointer induction variables. 5409 for (auto &Induction : Legal->getInductionVars()) { 5410 auto *Ind = Induction.first; 5411 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5412 5413 // Determine if all users of the induction variable are uniform after 5414 // vectorization. 5415 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5416 auto *I = cast<Instruction>(U); 5417 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5418 isVectorizedMemAccessUse(I, Ind); 5419 }); 5420 if (!UniformInd) 5421 continue; 5422 5423 // Determine if all users of the induction variable update instruction are 5424 // uniform after vectorization. 5425 auto UniformIndUpdate = 5426 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5427 auto *I = cast<Instruction>(U); 5428 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5429 isVectorizedMemAccessUse(I, IndUpdate); 5430 }); 5431 if (!UniformIndUpdate) 5432 continue; 5433 5434 // The induction variable and its update instruction will remain uniform. 5435 addToWorklistIfAllowed(Ind); 5436 addToWorklistIfAllowed(IndUpdate); 5437 } 5438 5439 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5440 } 5441 5442 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5443 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5444 5445 if (Legal->getRuntimePointerChecking()->Need) { 5446 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5447 "runtime pointer checks needed. Enable vectorization of this " 5448 "loop with '#pragma clang loop vectorize(enable)' when " 5449 "compiling with -Os/-Oz", 5450 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5451 return true; 5452 } 5453 5454 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5455 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5456 "runtime SCEV checks needed. Enable vectorization of this " 5457 "loop with '#pragma clang loop vectorize(enable)' when " 5458 "compiling with -Os/-Oz", 5459 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5460 return true; 5461 } 5462 5463 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5464 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5465 reportVectorizationFailure("Runtime stride check for small trip count", 5466 "runtime stride == 1 checks needed. Enable vectorization of " 5467 "this loop without such check by compiling with -Os/-Oz", 5468 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5469 return true; 5470 } 5471 5472 return false; 5473 } 5474 5475 Optional<ElementCount> 5476 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5477 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5478 // TODO: It may by useful to do since it's still likely to be dynamically 5479 // uniform if the target can skip. 5480 reportVectorizationFailure( 5481 "Not inserting runtime ptr check for divergent target", 5482 "runtime pointer checks needed. 
Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return MaxVF;
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimizing
    // for size.
    if (runtimeChecksRequired())
      return None;

    break;
  }

  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fall back to vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return MaxVF;
    }
    return None;
  }

  // Now try to fold the tail.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  assert(!MaxVF.isScalable() &&
         "Scalable vectors do not yet support tail folding");
  assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
         "MaxVF must be a power of 2");
  unsigned MaxVFtimesIC =
      UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we choose.
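  // Illustrative example with hypothetical numbers: for a known trip count of
  // 128 with MaxVF = 8 and UserIC = 2, MaxVFtimesIC = 16 and 128 % 16 == 0, so
  // no scalar tail remains and MaxVF is returned without folding the tail.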
5563 ScalarEvolution *SE = PSE.getSE(); 5564 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5565 const SCEV *ExitCount = SE->getAddExpr( 5566 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5567 const SCEV *Rem = SE->getURemExpr( 5568 ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5569 if (Rem->isZero()) { 5570 // Accept MaxVF if we do not have a tail. 5571 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5572 return MaxVF; 5573 } 5574 5575 // If we don't know the precise trip count, or if the trip count that we 5576 // found modulo the vectorization factor is not zero, try to fold the tail 5577 // by masking. 5578 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5579 if (Legal->prepareToFoldTailByMasking()) { 5580 FoldTailByMasking = true; 5581 return MaxVF; 5582 } 5583 5584 // If there was a tail-folding hint/switch, but we can't fold the tail by 5585 // masking, fallback to a vectorization with a scalar epilogue. 5586 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5587 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5588 "scalar epilogue instead.\n"); 5589 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5590 return MaxVF; 5591 } 5592 5593 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5594 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5595 return None; 5596 } 5597 5598 if (TC == 0) { 5599 reportVectorizationFailure( 5600 "Unable to calculate the loop count due to complex control flow", 5601 "unable to calculate the loop count due to complex control flow", 5602 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5603 return None; 5604 } 5605 5606 reportVectorizationFailure( 5607 "Cannot optimize for size and vectorize at the same time.", 5608 "cannot optimize for size and vectorize at the same time. " 5609 "Enable vectorization of this loop with '#pragma clang loop " 5610 "vectorize(enable)' when compiling with -Os/-Oz", 5611 "NoTailLoopWithOptForSize", ORE, TheLoop); 5612 return None; 5613 } 5614 5615 ElementCount 5616 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5617 ElementCount UserVF) { 5618 bool IgnoreScalableUserVF = UserVF.isScalable() && 5619 !TTI.supportsScalableVectors() && 5620 !ForceTargetSupportsScalableVectors; 5621 if (IgnoreScalableUserVF) { 5622 LLVM_DEBUG( 5623 dbgs() << "LV: Ignoring VF=" << UserVF 5624 << " because target does not support scalable vectors.\n"); 5625 ORE->emit([&]() { 5626 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", 5627 TheLoop->getStartLoc(), 5628 TheLoop->getHeader()) 5629 << "Ignoring VF=" << ore::NV("UserVF", UserVF) 5630 << " because target does not support scalable vectors."; 5631 }); 5632 } 5633 5634 // Beyond this point two scenarios are handled. If UserVF isn't specified 5635 // then a suitable VF is chosen. If UserVF is specified and there are 5636 // dependencies, check if it's legal. However, if a UserVF is specified and 5637 // there are no dependencies, then there's nothing to do. 
  if (UserVF.isNonZero() && !IgnoreScalableUserVF &&
      Legal->isSafeForAnyVectorWidth())
    return UserVF;

  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where the type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();

  // If the user vectorization factor is legally unsafe, clamp it to a safe
  // value. Otherwise, return it as is.
  if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
    unsigned MaxSafeElements =
        PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
    ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);

    if (UserVF.isScalable()) {
      Optional<unsigned> MaxVScale = TTI.getMaxVScale();

      // Scale VF by vscale before checking if it's safe.
      MaxSafeVF = ElementCount::getScalable(
          MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);

      if (MaxSafeVF.isZero()) {
        // The dependence distance is too small to use scalable vectors; fall
        // back on fixed-width vectorization.
        LLVM_DEBUG(
            dbgs()
            << "LV: Max legal vector width too small, scalable vectorization "
               "unfeasible. Using fixed-width vectorization instead.\n");
        ORE->emit([&]() {
          return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
                                            TheLoop->getStartLoc(),
                                            TheLoop->getHeader())
                 << "Max legal vector width too small, scalable vectorization "
                 << "unfeasible. Using fixed-width vectorization instead.";
        });
        return computeFeasibleMaxVF(
            ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
      }
    }

    LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");

    if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
      return UserVF;

    LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                      << " is unsafe, clamping to max safe VF=" << MaxSafeVF
                      << ".\n");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                        TheLoop->getStartLoc(),
                                        TheLoop->getHeader())
             << "User-specified vectorization factor "
             << ore::NV("UserVectorizationFactor", UserVF)
             << " is unsafe, clamping to maximum safe vectorization factor "
             << ore::NV("VectorizationFactor", MaxSafeVF);
    });
    return MaxSafeVF;
  }

  WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
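  // Illustrative example with hypothetical numbers: with a 256-bit widest
  // register and a widest scalar type of 32 bits, the computation below gives
  // MaxVectorSize = PowerOf2Floor(256 / 32) = 8 elements per vector.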
5710 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5711 5712 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5713 << " / " << WidestType << " bits.\n"); 5714 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5715 << WidestRegister << " bits.\n"); 5716 5717 assert(MaxVectorSize <= WidestRegister && 5718 "Did not expect to pack so many elements" 5719 " into one vector!"); 5720 if (MaxVectorSize == 0) { 5721 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5722 MaxVectorSize = 1; 5723 return ElementCount::getFixed(MaxVectorSize); 5724 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5725 isPowerOf2_32(ConstTripCount)) { 5726 // We need to clamp the VF to be the ConstTripCount. There is no point in 5727 // choosing a higher viable VF as done in the loop below. 5728 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5729 << ConstTripCount << "\n"); 5730 MaxVectorSize = ConstTripCount; 5731 return ElementCount::getFixed(MaxVectorSize); 5732 } 5733 5734 unsigned MaxVF = MaxVectorSize; 5735 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5736 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5737 // Collect all viable vectorization factors larger than the default MaxVF 5738 // (i.e. MaxVectorSize). 5739 SmallVector<ElementCount, 8> VFs; 5740 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5741 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5742 VFs.push_back(ElementCount::getFixed(VS)); 5743 5744 // For each VF calculate its register usage. 5745 auto RUs = calculateRegisterUsage(VFs); 5746 5747 // Select the largest VF which doesn't require more registers than existing 5748 // ones. 5749 for (int i = RUs.size() - 1; i >= 0; --i) { 5750 bool Selected = true; 5751 for (auto& pair : RUs[i].MaxLocalUsers) { 5752 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5753 if (pair.second > TargetNumRegisters) 5754 Selected = false; 5755 } 5756 if (Selected) { 5757 MaxVF = VFs[i].getKnownMinValue(); 5758 break; 5759 } 5760 } 5761 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5762 if (MaxVF < MinVF) { 5763 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5764 << ") with target's minimum: " << MinVF << '\n'); 5765 MaxVF = MinVF; 5766 } 5767 } 5768 } 5769 return ElementCount::getFixed(MaxVF); 5770 } 5771 5772 VectorizationFactor 5773 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5774 // FIXME: This can be fixed for scalable vectors later, because at this stage 5775 // the LoopVectorizer will only consider vectorizing a loop with scalable 5776 // vectors when the loop has a hint to enable vectorization for a given VF. 5777 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5778 5779 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5780 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5781 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5782 5783 unsigned Width = 1; 5784 const float ScalarCost = *ExpectedCost.getValue(); 5785 float Cost = ScalarCost; 5786 5787 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5788 if (ForceVectorization && MaxVF.isVector()) { 5789 // Ignore scalar width, because the user explicitly wants vectorization. 5790 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5791 // evaluation. 
5792 Cost = std::numeric_limits<float>::max(); 5793 } 5794 5795 for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { 5796 // Notice that the vector loop needs to be executed less times, so 5797 // we need to divide the cost of the vector loops by the width of 5798 // the vector elements. 5799 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5800 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 5801 float VectorCost = *C.first.getValue() / (float)i; 5802 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5803 << " costs: " << (int)VectorCost << ".\n"); 5804 if (!C.second && !ForceVectorization) { 5805 LLVM_DEBUG( 5806 dbgs() << "LV: Not considering vector loop of width " << i 5807 << " because it will not generate any vector instructions.\n"); 5808 continue; 5809 } 5810 5811 // If profitable add it to ProfitableVF list. 5812 if (VectorCost < ScalarCost) { 5813 ProfitableVFs.push_back(VectorizationFactor( 5814 {ElementCount::getFixed(i), (unsigned)VectorCost})); 5815 } 5816 5817 if (VectorCost < Cost) { 5818 Cost = VectorCost; 5819 Width = i; 5820 } 5821 } 5822 5823 if (!EnableCondStoresVectorization && NumPredStores) { 5824 reportVectorizationFailure("There are conditional stores.", 5825 "store that is conditionally executed prevents vectorization", 5826 "ConditionalStore", ORE, TheLoop); 5827 Width = 1; 5828 Cost = ScalarCost; 5829 } 5830 5831 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5832 << "LV: Vectorization seems to be not beneficial, " 5833 << "but was forced by a user.\n"); 5834 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5835 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5836 (unsigned)(Width * Cost)}; 5837 return Factor; 5838 } 5839 5840 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5841 const Loop &L, ElementCount VF) const { 5842 // Cross iteration phis such as reductions need special handling and are 5843 // currently unsupported. 5844 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5845 return Legal->isFirstOrderRecurrence(&Phi) || 5846 Legal->isReductionVariable(&Phi); 5847 })) 5848 return false; 5849 5850 // Phis with uses outside of the loop require special handling and are 5851 // currently unsupported. 5852 for (auto &Entry : Legal->getInductionVars()) { 5853 // Look for uses of the value of the induction at the last iteration. 5854 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5855 for (User *U : PostInc->users()) 5856 if (!L.contains(cast<Instruction>(U))) 5857 return false; 5858 // Look for uses of penultimate value of the induction. 5859 for (User *U : Entry.first->users()) 5860 if (!L.contains(cast<Instruction>(U))) 5861 return false; 5862 } 5863 5864 // Induction variables that are widened require special handling that is 5865 // currently not supported. 5866 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5867 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5868 this->isProfitableToScalarize(Entry.first, VF)); 5869 })) 5870 return false; 5871 5872 return true; 5873 } 5874 5875 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5876 const ElementCount VF) const { 5877 // FIXME: We need a much better cost-model to take different parameters such 5878 // as register pressure, code size increase and cost of extra branches into 5879 // account. 
For now we apply a very crude heuristic and only consider loops 5880 // with vectorization factors larger than a certain value. 5881 // We also consider epilogue vectorization unprofitable for targets that don't 5882 // consider interleaving beneficial (eg. MVE). 5883 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5884 return false; 5885 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5886 return true; 5887 return false; 5888 } 5889 5890 VectorizationFactor 5891 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5892 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5893 VectorizationFactor Result = VectorizationFactor::Disabled(); 5894 if (!EnableEpilogueVectorization) { 5895 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5896 return Result; 5897 } 5898 5899 if (!isScalarEpilogueAllowed()) { 5900 LLVM_DEBUG( 5901 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5902 "allowed.\n";); 5903 return Result; 5904 } 5905 5906 // FIXME: This can be fixed for scalable vectors later, because at this stage 5907 // the LoopVectorizer will only consider vectorizing a loop with scalable 5908 // vectors when the loop has a hint to enable vectorization for a given VF. 5909 if (MainLoopVF.isScalable()) { 5910 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5911 "yet supported.\n"); 5912 return Result; 5913 } 5914 5915 // Not really a cost consideration, but check for unsupported cases here to 5916 // simplify the logic. 5917 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5918 LLVM_DEBUG( 5919 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5920 "not a supported candidate.\n";); 5921 return Result; 5922 } 5923 5924 if (EpilogueVectorizationForceVF > 1) { 5925 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5926 if (LVP.hasPlanWithVFs( 5927 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5928 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5929 else { 5930 LLVM_DEBUG( 5931 dbgs() 5932 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5933 return Result; 5934 } 5935 } 5936 5937 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5938 TheLoop->getHeader()->getParent()->hasMinSize()) { 5939 LLVM_DEBUG( 5940 dbgs() 5941 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5942 return Result; 5943 } 5944 5945 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 5946 return Result; 5947 5948 for (auto &NextVF : ProfitableVFs) 5949 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 5950 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 5951 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 5952 Result = NextVF; 5953 5954 if (Result != VectorizationFactor::Disabled()) 5955 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5956 << Result.Width.getFixedValue() << "\n";); 5957 return Result; 5958 } 5959 5960 std::pair<unsigned, unsigned> 5961 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5962 unsigned MinWidth = -1U; 5963 unsigned MaxWidth = 8; 5964 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5965 5966 // For each block. 5967 for (BasicBlock *BB : TheLoop->blocks()) { 5968 // For each instruction in the loop. 5969 for (Instruction &I : BB->instructionsWithoutDebug()) { 5970 Type *T = I.getType(); 5971 5972 // Skip ignored values. 
5973 if (ValuesToIgnore.count(&I)) 5974 continue; 5975 5976 // Only examine Loads, Stores and PHINodes. 5977 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5978 continue; 5979 5980 // Examine PHI nodes that are reduction variables. Update the type to 5981 // account for the recurrence type. 5982 if (auto *PN = dyn_cast<PHINode>(&I)) { 5983 if (!Legal->isReductionVariable(PN)) 5984 continue; 5985 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5986 T = RdxDesc.getRecurrenceType(); 5987 } 5988 5989 // Examine the stored values. 5990 if (auto *ST = dyn_cast<StoreInst>(&I)) 5991 T = ST->getValueOperand()->getType(); 5992 5993 // Ignore loaded pointer types and stored pointer types that are not 5994 // vectorizable. 5995 // 5996 // FIXME: The check here attempts to predict whether a load or store will 5997 // be vectorized. We only know this for certain after a VF has 5998 // been selected. Here, we assume that if an access can be 5999 // vectorized, it will be. We should also look at extending this 6000 // optimization to non-pointer types. 6001 // 6002 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6003 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6004 continue; 6005 6006 MinWidth = std::min(MinWidth, 6007 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6008 MaxWidth = std::max(MaxWidth, 6009 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6010 } 6011 } 6012 6013 return {MinWidth, MaxWidth}; 6014 } 6015 6016 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6017 unsigned LoopCost) { 6018 // -- The interleave heuristics -- 6019 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6020 // There are many micro-architectural considerations that we can't predict 6021 // at this level. For example, frontend pressure (on decode or fetch) due to 6022 // code size, or the number and capabilities of the execution ports. 6023 // 6024 // We use the following heuristics to select the interleave count: 6025 // 1. If the code has reductions, then we interleave to break the cross 6026 // iteration dependency. 6027 // 2. If the loop is really small, then we interleave to reduce the loop 6028 // overhead. 6029 // 3. We don't interleave if we think that we will spill registers to memory 6030 // due to the increased register pressure. 6031 6032 if (!isScalarEpilogueAllowed()) 6033 return 1; 6034 6035 // We used the distance for the interleave count. 6036 if (Legal->getMaxSafeDepDistBytes() != -1U) 6037 return 1; 6038 6039 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6040 const bool HasReductions = !Legal->getReductionVars().empty(); 6041 // Do not interleave loops with a relatively small known or estimated trip 6042 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6043 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6044 // because with the above conditions interleaving can expose ILP and break 6045 // cross iteration dependences for reductions. 6046 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6047 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6048 return 1; 6049 6050 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6051 // We divide by these constants so assume that we have at least one 6052 // instruction that uses at least one register. 
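  // Illustrative example for the interleave-count computation below, with
  // hypothetical numbers: given 32 registers in a class, 2 of which hold
  // loop-invariant values, and at most 6 values of that class live at once,
  // the estimate is IC = PowerOf2Floor((32 - 2) / 6) = 4 interleaved copies.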
6053 for (auto& pair : R.MaxLocalUsers) { 6054 pair.second = std::max(pair.second, 1U); 6055 } 6056 6057 // We calculate the interleave count using the following formula. 6058 // Subtract the number of loop invariants from the number of available 6059 // registers. These registers are used by all of the interleaved instances. 6060 // Next, divide the remaining registers by the number of registers that is 6061 // required by the loop, in order to estimate how many parallel instances 6062 // fit without causing spills. All of this is rounded down if necessary to be 6063 // a power of two. We want power of two interleave count to simplify any 6064 // addressing operations or alignment considerations. 6065 // We also want power of two interleave counts to ensure that the induction 6066 // variable of the vector loop wraps to zero, when tail is folded by masking; 6067 // this currently happens when OptForSize, in which case IC is set to 1 above. 6068 unsigned IC = UINT_MAX; 6069 6070 for (auto& pair : R.MaxLocalUsers) { 6071 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6072 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6073 << " registers of " 6074 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6075 if (VF.isScalar()) { 6076 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6077 TargetNumRegisters = ForceTargetNumScalarRegs; 6078 } else { 6079 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6080 TargetNumRegisters = ForceTargetNumVectorRegs; 6081 } 6082 unsigned MaxLocalUsers = pair.second; 6083 unsigned LoopInvariantRegs = 0; 6084 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6085 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6086 6087 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6088 // Don't count the induction variable as interleaved. 6089 if (EnableIndVarRegisterHeur) { 6090 TmpIC = 6091 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6092 std::max(1U, (MaxLocalUsers - 1))); 6093 } 6094 6095 IC = std::min(IC, TmpIC); 6096 } 6097 6098 // Clamp the interleave ranges to reasonable counts. 6099 unsigned MaxInterleaveCount = 6100 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6101 6102 // Check if the user has overridden the max. 6103 if (VF.isScalar()) { 6104 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6105 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6106 } else { 6107 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6108 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6109 } 6110 6111 // If trip count is known or estimated compile time constant, limit the 6112 // interleave count to be less than the trip count divided by VF, provided it 6113 // is at least 1. 6114 // 6115 // For scalable vectors we can't know if interleaving is beneficial. It may 6116 // not be beneficial for small loops if none of the lanes in the second vector 6117 // iterations is enabled. However, for larger loops, there is likely to be a 6118 // similar benefit as for fixed-width vectors. For now, we choose to leave 6119 // the InterleaveCount as if vscale is '1', although if some information about 6120 // the vector is known (e.g. min vector size), we can make a better decision. 6121 if (BestKnownTC) { 6122 MaxInterleaveCount = 6123 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6124 // Make sure MaxInterleaveCount is greater than 0. 
    MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(1u, IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
    LoopCost = *expectedCost(VF).first.getValue();
  }

  assert(LoopCost && "Non-zero loop cost expected");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleaveReductions =
      TTI.enableAggressiveInterleaving(HasReductions);
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count)
    // are saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit this, by default, to 2,
    // so the critical path only gets increased by one reduction operation.
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are encountered
  // before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi-map that
  // holds the list of intervals that *end* at a specific location. This
  // multi-map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it in
  // a set. If we find this value in the multi-map then we remove it from the
  // set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the set of instructions that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
6278 if (!TheLoop->contains(Instr)) { 6279 LoopInvariants.insert(Instr); 6280 continue; 6281 } 6282 6283 // Overwrite previous end points. 6284 EndPoint[Instr] = IdxToInstr.size(); 6285 Ends.insert(Instr); 6286 } 6287 } 6288 } 6289 6290 // Saves the list of intervals that end with the index in 'key'. 6291 using InstrList = SmallVector<Instruction *, 2>; 6292 DenseMap<unsigned, InstrList> TransposeEnds; 6293 6294 // Transpose the EndPoints to a list of values that end at each index. 6295 for (auto &Interval : EndPoint) 6296 TransposeEnds[Interval.second].push_back(Interval.first); 6297 6298 SmallPtrSet<Instruction *, 8> OpenIntervals; 6299 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6300 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6301 6302 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6303 6304 // A lambda that gets the register usage for the given type and VF. 6305 const auto &TTICapture = TTI; 6306 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6307 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6308 return 0U; 6309 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6310 }; 6311 6312 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6313 Instruction *I = IdxToInstr[i]; 6314 6315 // Remove all of the instructions that end at this location. 6316 InstrList &List = TransposeEnds[i]; 6317 for (Instruction *ToRemove : List) 6318 OpenIntervals.erase(ToRemove); 6319 6320 // Ignore instructions that are never used within the loop. 6321 if (!Ends.count(I)) 6322 continue; 6323 6324 // Skip ignored values. 6325 if (ValuesToIgnore.count(I)) 6326 continue; 6327 6328 // For each VF find the maximum usage of registers. 6329 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6330 // Count the number of live intervals. 6331 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6332 6333 if (VFs[j].isScalar()) { 6334 for (auto Inst : OpenIntervals) { 6335 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6336 if (RegUsage.find(ClassID) == RegUsage.end()) 6337 RegUsage[ClassID] = 1; 6338 else 6339 RegUsage[ClassID] += 1; 6340 } 6341 } else { 6342 collectUniformsAndScalars(VFs[j]); 6343 for (auto Inst : OpenIntervals) { 6344 // Skip ignored values for VF > 1. 6345 if (VecValuesToIgnore.count(Inst)) 6346 continue; 6347 if (isScalarAfterVectorization(Inst, VFs[j])) { 6348 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6349 if (RegUsage.find(ClassID) == RegUsage.end()) 6350 RegUsage[ClassID] = 1; 6351 else 6352 RegUsage[ClassID] += 1; 6353 } else { 6354 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6355 if (RegUsage.find(ClassID) == RegUsage.end()) 6356 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6357 else 6358 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6359 } 6360 } 6361 } 6362 6363 for (auto& pair : RegUsage) { 6364 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6365 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6366 else 6367 MaxUsages[j][pair.first] = pair.second; 6368 } 6369 } 6370 6371 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6372 << OpenIntervals.size() << '\n'); 6373 6374 // Add the current instruction to the list of open intervals. 
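    // For example (hypothetical): a value defined near the top of the block
    // whose last in-loop use comes many instructions later stays in
    // OpenIntervals for every index in between, and so is counted against the
    // register budget of each candidate VF at each of those points.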
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto Inst : LoopInvariants) {
      unsigned Usage =
          VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      if (Invariant.find(ClassID) == Invariant.end())
        Invariant[ClassID] = Usage;
      else
        Invariant[ClassID] += Usage;
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n";
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Masked load/gather emulation was previously never allowed.
  // A limited amount of masked store/scatter emulation was allowed.
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() ||
      InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
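        // Illustrative example with hypothetical costs: if the vectorized form
        // of the chain feeding this instruction costs 10 and its scalarized,
        // probability-scaled form costs 6, the discount is 10 - 6 = 4 >= 0, so
        // the chain is recorded in ScalarCostsVF and kept scalar.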
6458 if (!useEmulatedMaskMemRefHack(&I) && 6459 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6460 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6461 // Remember that BB will remain after vectorization. 6462 PredicatedBBsAfterVectorization.insert(BB); 6463 } 6464 } 6465 } 6466 6467 int LoopVectorizationCostModel::computePredInstDiscount( 6468 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6469 assert(!isUniformAfterVectorization(PredInst, VF) && 6470 "Instruction marked uniform-after-vectorization will be predicated"); 6471 6472 // Initialize the discount to zero, meaning that the scalar version and the 6473 // vector version cost the same. 6474 InstructionCost Discount = 0; 6475 6476 // Holds instructions to analyze. The instructions we visit are mapped in 6477 // ScalarCosts. Those instructions are the ones that would be scalarized if 6478 // we find that the scalar version costs less. 6479 SmallVector<Instruction *, 8> Worklist; 6480 6481 // Returns true if the given instruction can be scalarized. 6482 auto canBeScalarized = [&](Instruction *I) -> bool { 6483 // We only attempt to scalarize instructions forming a single-use chain 6484 // from the original predicated block that would otherwise be vectorized. 6485 // Although not strictly necessary, we give up on instructions we know will 6486 // already be scalar to avoid traversing chains that are unlikely to be 6487 // beneficial. 6488 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6489 isScalarAfterVectorization(I, VF)) 6490 return false; 6491 6492 // If the instruction is scalar with predication, it will be analyzed 6493 // separately. We ignore it within the context of PredInst. 6494 if (isScalarWithPredication(I)) 6495 return false; 6496 6497 // If any of the instruction's operands are uniform after vectorization, 6498 // the instruction cannot be scalarized. This prevents, for example, a 6499 // masked load from being scalarized. 6500 // 6501 // We assume we will only emit a value for lane zero of an instruction 6502 // marked uniform after vectorization, rather than VF identical values. 6503 // Thus, if we scalarize an instruction that uses a uniform, we would 6504 // create uses of values corresponding to the lanes we aren't emitting code 6505 // for. This behavior can be changed by allowing getScalarValue to clone 6506 // the lane zero values for uniforms rather than asserting. 6507 for (Use &U : I->operands()) 6508 if (auto *J = dyn_cast<Instruction>(U.get())) 6509 if (isUniformAfterVectorization(J, VF)) 6510 return false; 6511 6512 // Otherwise, we can scalarize the instruction. 6513 return true; 6514 }; 6515 6516 // Compute the expected cost discount from scalarizing the entire expression 6517 // feeding the predicated instruction. We currently only consider expressions 6518 // that are single-use instruction chains. 6519 Worklist.push_back(PredInst); 6520 while (!Worklist.empty()) { 6521 Instruction *I = Worklist.pop_back_val(); 6522 6523 // If we've already analyzed the instruction, there's nothing to do. 6524 if (ScalarCosts.find(I) != ScalarCosts.end()) 6525 continue; 6526 6527 // Compute the cost of the vector instruction. Note that this cost already 6528 // includes the scalarization overhead of the predicated instruction. 6529 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6530 6531 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6532 // the instruction as if it wasn't if-converted and instead remained in the 6533 // predicated block. We will scale this cost by block probability after 6534 // computing the scalarization overhead. 6535 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6536 InstructionCost ScalarCost = 6537 VF.getKnownMinValue() * 6538 getInstructionCost(I, ElementCount::getFixed(1)).first; 6539 6540 // Compute the scalarization overhead of needed insertelement instructions 6541 // and phi nodes. 6542 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6543 ScalarCost += TTI.getScalarizationOverhead( 6544 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6545 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6546 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6547 ScalarCost += 6548 VF.getKnownMinValue() * 6549 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6550 } 6551 6552 // Compute the scalarization overhead of needed extractelement 6553 // instructions. For each of the instruction's operands, if the operand can 6554 // be scalarized, add it to the worklist; otherwise, account for the 6555 // overhead. 6556 for (Use &U : I->operands()) 6557 if (auto *J = dyn_cast<Instruction>(U.get())) { 6558 assert(VectorType::isValidElementType(J->getType()) && 6559 "Instruction has non-scalar type"); 6560 if (canBeScalarized(J)) 6561 Worklist.push_back(J); 6562 else if (needsExtract(J, VF)) { 6563 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6564 ScalarCost += TTI.getScalarizationOverhead( 6565 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6566 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6567 } 6568 } 6569 6570 // Scale the total scalar cost by block probability. 6571 ScalarCost /= getReciprocalPredBlockProb(); 6572 6573 // Compute the discount. A non-negative discount means the vector version 6574 // of the instruction costs more, and scalarizing would be beneficial. 6575 Discount += VectorCost - ScalarCost; 6576 ScalarCosts[I] = ScalarCost; 6577 } 6578 6579 return *Discount.getValue(); 6580 } 6581 6582 LoopVectorizationCostModel::VectorizationCostTy 6583 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6584 VectorizationCostTy Cost; 6585 6586 // For each block. 6587 for (BasicBlock *BB : TheLoop->blocks()) { 6588 VectorizationCostTy BlockCost; 6589 6590 // For each instruction in the old loop. 6591 for (Instruction &I : BB->instructionsWithoutDebug()) { 6592 // Skip ignored values. 6593 if (ValuesToIgnore.count(&I) || 6594 (VF.isVector() && VecValuesToIgnore.count(&I))) 6595 continue; 6596 6597 VectorizationCostTy C = getInstructionCost(&I, VF); 6598 6599 // Check if we should override the cost. 6600 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6601 C.first = InstructionCost(ForceTargetInstructionCost); 6602 6603 BlockCost.first += C.first; 6604 BlockCost.second |= C.second; 6605 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6606 << " for VF " << VF << " For instruction: " << I 6607 << '\n'); 6608 } 6609 6610 // If we are vectorizing a predicated block, it will have been 6611 // if-converted. This means that the block's instructions (aside from 6612 // stores and instructions that may divide by zero) will now be 6613 // unconditionally executed. For the scalar case, we may not always execute 6614 // the predicated block, if it is an if-else block. Thus, scale the block's 6615 // cost by the probability of executing it. 
blockNeedsPredication from 6616 // Legal is used so as to not include all blocks in tail folded loops. 6617 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6618 BlockCost.first /= getReciprocalPredBlockProb(); 6619 6620 Cost.first += BlockCost.first; 6621 Cost.second |= BlockCost.second; 6622 } 6623 6624 return Cost; 6625 } 6626 6627 /// Gets Address Access SCEV after verifying that the access pattern 6628 /// is loop invariant except the induction variable dependence. 6629 /// 6630 /// This SCEV can be sent to the Target in order to estimate the address 6631 /// calculation cost. 6632 static const SCEV *getAddressAccessSCEV( 6633 Value *Ptr, 6634 LoopVectorizationLegality *Legal, 6635 PredicatedScalarEvolution &PSE, 6636 const Loop *TheLoop) { 6637 6638 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6639 if (!Gep) 6640 return nullptr; 6641 6642 // We are looking for a gep with all loop invariant indices except for one 6643 // which should be an induction variable. 6644 auto SE = PSE.getSE(); 6645 unsigned NumOperands = Gep->getNumOperands(); 6646 for (unsigned i = 1; i < NumOperands; ++i) { 6647 Value *Opd = Gep->getOperand(i); 6648 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6649 !Legal->isInductionVariable(Opd)) 6650 return nullptr; 6651 } 6652 6653 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6654 return PSE.getSCEV(Ptr); 6655 } 6656 6657 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6658 return Legal->hasStride(I->getOperand(0)) || 6659 Legal->hasStride(I->getOperand(1)); 6660 } 6661 6662 InstructionCost 6663 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6664 ElementCount VF) { 6665 assert(VF.isVector() && 6666 "Scalarization cost of instruction implies vectorization."); 6667 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6668 Type *ValTy = getMemInstValueType(I); 6669 auto SE = PSE.getSE(); 6670 6671 unsigned AS = getLoadStoreAddressSpace(I); 6672 Value *Ptr = getLoadStorePointerOperand(I); 6673 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6674 6675 // Figure out whether the access is strided and get the stride value 6676 // if it's known in compile time 6677 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6678 6679 // Get the cost of the scalar memory instruction and address computation. 6680 InstructionCost Cost = 6681 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6682 6683 // Don't pass *I here, since it is scalar but will actually be part of a 6684 // vectorized loop where the user of it is a vectorized instruction. 6685 const Align Alignment = getLoadStoreAlignment(I); 6686 Cost += VF.getKnownMinValue() * 6687 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6688 AS, TTI::TCK_RecipThroughput); 6689 6690 // Get the overhead of the extractelement and insertelement instructions 6691 // we might create due to scalarization. 6692 Cost += getScalarizationOverhead(I, VF); 6693 6694 // If we have a predicated store, it may not be executed for each vector 6695 // lane. Scale the cost by the probability of executing the predicated 6696 // block. 6697 if (isPredicatedInst(I)) { 6698 Cost /= getReciprocalPredBlockProb(); 6699 6700 if (useEmulatedMaskMemRefHack(I)) 6701 // Artificially setting to a high enough value to practically disable 6702 // vectorization with such operations. 
6703 Cost = 3000000; 6704 } 6705 6706 return Cost; 6707 } 6708 6709 InstructionCost 6710 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6711 ElementCount VF) { 6712 Type *ValTy = getMemInstValueType(I); 6713 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6714 Value *Ptr = getLoadStorePointerOperand(I); 6715 unsigned AS = getLoadStoreAddressSpace(I); 6716 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6717 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6718 6719 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6720 "Stride should be 1 or -1 for consecutive memory access"); 6721 const Align Alignment = getLoadStoreAlignment(I); 6722 InstructionCost Cost = 0; 6723 if (Legal->isMaskRequired(I)) 6724 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6725 CostKind); 6726 else 6727 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6728 CostKind, I); 6729 6730 bool Reverse = ConsecutiveStride < 0; 6731 if (Reverse) 6732 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6733 return Cost; 6734 } 6735 6736 InstructionCost 6737 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6738 ElementCount VF) { 6739 assert(Legal->isUniformMemOp(*I)); 6740 6741 Type *ValTy = getMemInstValueType(I); 6742 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6743 const Align Alignment = getLoadStoreAlignment(I); 6744 unsigned AS = getLoadStoreAddressSpace(I); 6745 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6746 if (isa<LoadInst>(I)) { 6747 return TTI.getAddressComputationCost(ValTy) + 6748 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6749 CostKind) + 6750 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6751 } 6752 StoreInst *SI = cast<StoreInst>(I); 6753 6754 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6755 return TTI.getAddressComputationCost(ValTy) + 6756 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6757 CostKind) + 6758 (isLoopInvariantStoreValue 6759 ? 0 6760 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6761 VF.getKnownMinValue() - 1)); 6762 } 6763 6764 InstructionCost 6765 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6766 ElementCount VF) { 6767 Type *ValTy = getMemInstValueType(I); 6768 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6769 const Align Alignment = getLoadStoreAlignment(I); 6770 const Value *Ptr = getLoadStorePointerOperand(I); 6771 6772 return TTI.getAddressComputationCost(VectorTy) + 6773 TTI.getGatherScatterOpCost( 6774 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6775 TargetTransformInfo::TCK_RecipThroughput, I); 6776 } 6777 6778 InstructionCost 6779 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6780 ElementCount VF) { 6781 Type *ValTy = getMemInstValueType(I); 6782 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6783 unsigned AS = getLoadStoreAddressSpace(I); 6784 6785 auto Group = getInterleavedAccessGroup(I); 6786 assert(Group && "Fail to get an interleaved access group."); 6787 6788 unsigned InterleaveFactor = Group->getFactor(); 6789 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6790 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6791 6792 // Holds the indices of existing members in an interleaved load group. 6793 // An interleaved store group doesn't need this as it doesn't allow gaps. 
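  // For example (hypothetical group): a factor-3 load group whose member at
  // index 1 is missing would be costed with Indices = {0, 2}.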
6794 SmallVector<unsigned, 4> Indices; 6795 if (isa<LoadInst>(I)) { 6796 for (unsigned i = 0; i < InterleaveFactor; i++) 6797 if (Group->getMember(i)) 6798 Indices.push_back(i); 6799 } 6800 6801 // Calculate the cost of the whole interleaved group. 6802 bool UseMaskForGaps = 6803 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6804 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6805 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6806 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6807 6808 if (Group->isReverse()) { 6809 // TODO: Add support for reversed masked interleaved access. 6810 assert(!Legal->isMaskRequired(I) && 6811 "Reverse masked interleaved access not supported."); 6812 Cost += Group->getNumMembers() * 6813 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6814 } 6815 return Cost; 6816 } 6817 6818 InstructionCost 6819 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6820 ElementCount VF) { 6821 // Calculate scalar cost only. Vectorization cost should be ready at this 6822 // moment. 6823 if (VF.isScalar()) { 6824 Type *ValTy = getMemInstValueType(I); 6825 const Align Alignment = getLoadStoreAlignment(I); 6826 unsigned AS = getLoadStoreAddressSpace(I); 6827 6828 return TTI.getAddressComputationCost(ValTy) + 6829 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6830 TTI::TCK_RecipThroughput, I); 6831 } 6832 return getWideningCost(I, VF); 6833 } 6834 6835 LoopVectorizationCostModel::VectorizationCostTy 6836 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6837 ElementCount VF) { 6838 // If we know that this instruction will remain uniform, check the cost of 6839 // the scalar version. 6840 if (isUniformAfterVectorization(I, VF)) 6841 VF = ElementCount::getFixed(1); 6842 6843 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6844 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6845 6846 // Forced scalars do not have any scalarization overhead. 6847 auto ForcedScalar = ForcedScalars.find(VF); 6848 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6849 auto InstSet = ForcedScalar->second; 6850 if (InstSet.count(I)) 6851 return VectorizationCostTy( 6852 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6853 VF.getKnownMinValue()), 6854 false); 6855 } 6856 6857 Type *VectorTy; 6858 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6859 6860 bool TypeNotScalarized = 6861 VF.isVector() && VectorTy->isVectorTy() && 6862 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6863 return VectorizationCostTy(C, TypeNotScalarized); 6864 } 6865 6866 InstructionCost 6867 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6868 ElementCount VF) { 6869 6870 assert(!VF.isScalable() && 6871 "cannot compute scalarization overhead for scalable vectorization"); 6872 if (VF.isScalar()) 6873 return 0; 6874 6875 InstructionCost Cost = 0; 6876 Type *RetTy = ToVectorTy(I->getType(), VF); 6877 if (!RetTy->isVoidTy() && 6878 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6879 Cost += TTI.getScalarizationOverhead( 6880 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 6881 true, false); 6882 6883 // Some targets keep addresses scalar. 6884 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6885 return Cost; 6886 6887 // Some targets support efficient element stores. 
6888 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6889 return Cost; 6890 6891 // Collect operands to consider. 6892 CallInst *CI = dyn_cast<CallInst>(I); 6893 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6894 6895 // Skip operands that do not require extraction/scalarization and do not incur 6896 // any overhead. 6897 return Cost + TTI.getOperandsScalarizationOverhead( 6898 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 6899 } 6900 6901 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6902 if (VF.isScalar()) 6903 return; 6904 NumPredStores = 0; 6905 for (BasicBlock *BB : TheLoop->blocks()) { 6906 // For each instruction in the old loop. 6907 for (Instruction &I : *BB) { 6908 Value *Ptr = getLoadStorePointerOperand(&I); 6909 if (!Ptr) 6910 continue; 6911 6912 // TODO: We should generate better code and update the cost model for 6913 // predicated uniform stores. Today they are treated as any other 6914 // predicated store (see added test cases in 6915 // invariant-store-vectorization.ll). 6916 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6917 NumPredStores++; 6918 6919 if (Legal->isUniformMemOp(I)) { 6920 // TODO: Avoid replicating loads and stores instead of 6921 // relying on instcombine to remove them. 6922 // Load: Scalar load + broadcast 6923 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6924 InstructionCost Cost = getUniformMemOpCost(&I, VF); 6925 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6926 continue; 6927 } 6928 6929 // We assume that widening is the best solution when possible. 6930 if (memoryInstructionCanBeWidened(&I, VF)) { 6931 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6932 int ConsecutiveStride = 6933 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6934 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6935 "Expected consecutive stride."); 6936 InstWidening Decision = 6937 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6938 setWideningDecision(&I, VF, Decision, Cost); 6939 continue; 6940 } 6941 6942 // Choose between Interleaving, Gather/Scatter or Scalarization. 6943 InstructionCost InterleaveCost = std::numeric_limits<int>::max(); 6944 unsigned NumAccesses = 1; 6945 if (isAccessInterleaved(&I)) { 6946 auto Group = getInterleavedAccessGroup(&I); 6947 assert(Group && "Fail to get an interleaved access group."); 6948 6949 // Make one decision for the whole group. 6950 if (getWideningDecision(&I, VF) != CM_Unknown) 6951 continue; 6952 6953 NumAccesses = Group->getNumMembers(); 6954 if (interleavedAccessCanBeWidened(&I, VF)) 6955 InterleaveCost = getInterleaveGroupCost(&I, VF); 6956 } 6957 6958 InstructionCost GatherScatterCost = 6959 isLegalGatherOrScatter(&I) 6960 ? getGatherScatterCost(&I, VF) * NumAccesses 6961 : std::numeric_limits<int>::max(); 6962 6963 InstructionCost ScalarizationCost = 6964 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6965 6966 // Choose better solution for the current VF, 6967 // write down this decision and use it during vectorization. 
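// Note: on a cost tie, interleaving is preferred over gather/scatter, while scalarization is preferred over both of the other options (see the strict and non-strict comparisons below).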
6968 InstructionCost Cost; 6969 InstWidening Decision; 6970 if (InterleaveCost <= GatherScatterCost && 6971 InterleaveCost < ScalarizationCost) { 6972 Decision = CM_Interleave; 6973 Cost = InterleaveCost; 6974 } else if (GatherScatterCost < ScalarizationCost) { 6975 Decision = CM_GatherScatter; 6976 Cost = GatherScatterCost; 6977 } else { 6978 Decision = CM_Scalarize; 6979 Cost = ScalarizationCost; 6980 } 6981 // If the instruction belongs to an interleave group, the whole group 6982 // receives the same decision. The whole group receives the cost, but 6983 // the cost will actually be assigned to one instruction. 6984 if (auto Group = getInterleavedAccessGroup(&I)) 6985 setWideningDecision(Group, VF, Decision, Cost); 6986 else 6987 setWideningDecision(&I, VF, Decision, Cost); 6988 } 6989 } 6990 6991 // Make sure that any load of address and any other address computation 6992 // remains scalar unless there is gather/scatter support. This avoids 6993 // inevitable extracts into address registers, and also has the benefit of 6994 // activating LSR more, since that pass can't optimize vectorized 6995 // addresses. 6996 if (TTI.prefersVectorizedAddressing()) 6997 return; 6998 6999 // Start with all scalar pointer uses. 7000 SmallPtrSet<Instruction *, 8> AddrDefs; 7001 for (BasicBlock *BB : TheLoop->blocks()) 7002 for (Instruction &I : *BB) { 7003 Instruction *PtrDef = 7004 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7005 if (PtrDef && TheLoop->contains(PtrDef) && 7006 getWideningDecision(&I, VF) != CM_GatherScatter) 7007 AddrDefs.insert(PtrDef); 7008 } 7009 7010 // Add all instructions used to generate the addresses. 7011 SmallVector<Instruction *, 4> Worklist; 7012 append_range(Worklist, AddrDefs); 7013 while (!Worklist.empty()) { 7014 Instruction *I = Worklist.pop_back_val(); 7015 for (auto &Op : I->operands()) 7016 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7017 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7018 AddrDefs.insert(InstOp).second) 7019 Worklist.push_back(InstOp); 7020 } 7021 7022 for (auto *I : AddrDefs) { 7023 if (isa<LoadInst>(I)) { 7024 // Setting the desired widening decision should ideally be handled 7025 // by the cost functions, but since this involves the task of finding out 7026 // if the loaded register is involved in an address computation, it is 7027 // instead changed here when we know this is the case. 7028 InstWidening Decision = getWideningDecision(I, VF); 7029 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7030 // Scalarize a widened load of address. 7031 setWideningDecision( 7032 I, VF, CM_Scalarize, 7033 (VF.getKnownMinValue() * 7034 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7035 else if (auto Group = getInterleavedAccessGroup(I)) { 7036 // Scalarize an interleave group of address loads. 7037 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7038 if (Instruction *Member = Group->getMember(I)) 7039 setWideningDecision( 7040 Member, VF, CM_Scalarize, 7041 (VF.getKnownMinValue() * 7042 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7043 } 7044 } 7045 } else 7046 // Make sure I gets scalarized and a cost estimate without 7047 // scalarization overhead.
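// (Instructions in ForcedScalars are later costed as VF copies of their scalar cost; see the ForcedScalars handling in getInstructionCost above.)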
7048 ForcedScalars[VF].insert(I); 7049 } 7050 } 7051 7052 InstructionCost 7053 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7054 Type *&VectorTy) { 7055 Type *RetTy = I->getType(); 7056 if (canTruncateToMinimalBitwidth(I, VF)) 7057 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7058 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7059 auto SE = PSE.getSE(); 7060 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7061 7062 // TODO: We need to estimate the cost of intrinsic calls. 7063 switch (I->getOpcode()) { 7064 case Instruction::GetElementPtr: 7065 // We mark this instruction as zero-cost because the cost of GEPs in 7066 // vectorized code depends on whether the corresponding memory instruction 7067 // is scalarized or not. Therefore, we handle GEPs with the memory 7068 // instruction cost. 7069 return 0; 7070 case Instruction::Br: { 7071 // In cases of scalarized and predicated instructions, there will be VF 7072 // predicated blocks in the vectorized loop. Each branch around these 7073 // blocks requires also an extract of its vector compare i1 element. 7074 bool ScalarPredicatedBB = false; 7075 BranchInst *BI = cast<BranchInst>(I); 7076 if (VF.isVector() && BI->isConditional() && 7077 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7078 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7079 ScalarPredicatedBB = true; 7080 7081 if (ScalarPredicatedBB) { 7082 // Return cost for branches around scalarized and predicated blocks. 7083 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7084 auto *Vec_i1Ty = 7085 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7086 return (TTI.getScalarizationOverhead( 7087 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7088 false, true) + 7089 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7090 VF.getKnownMinValue())); 7091 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7092 // The back-edge branch will remain, as will all scalar branches. 7093 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7094 else 7095 // This branch will be eliminated by if-conversion. 7096 return 0; 7097 // Note: We currently assume zero cost for an unconditional branch inside 7098 // a predicated block since it will become a fall-through, although we 7099 // may decide in the future to call TTI for all branches. 7100 } 7101 case Instruction::PHI: { 7102 auto *Phi = cast<PHINode>(I); 7103 7104 // First-order recurrences are replaced by vector shuffles inside the loop. 7105 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7106 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7107 return TTI.getShuffleCost( 7108 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7109 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7110 7111 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7112 // converted into select instructions. We require N - 1 selects per phi 7113 // node, where N is the number of incoming values. 
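// For example, a phi with three incoming values is lowered to a chain of two selects driven by the masks of its incoming edges.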
7114 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7115 return (Phi->getNumIncomingValues() - 1) * 7116 TTI.getCmpSelInstrCost( 7117 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7118 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7119 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7120 7121 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7122 } 7123 case Instruction::UDiv: 7124 case Instruction::SDiv: 7125 case Instruction::URem: 7126 case Instruction::SRem: 7127 // If we have a predicated instruction, it may not be executed for each 7128 // vector lane. Get the scalarization cost and scale this amount by the 7129 // probability of executing the predicated block. If the instruction is not 7130 // predicated, we fall through to the next case. 7131 if (VF.isVector() && isScalarWithPredication(I)) { 7132 InstructionCost Cost = 0; 7133 7134 // These instructions have a non-void type, so account for the phi nodes 7135 // that we will create. This cost is likely to be zero. The phi node 7136 // cost, if any, should be scaled by the block probability because it 7137 // models a copy at the end of each predicated block. 7138 Cost += VF.getKnownMinValue() * 7139 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7140 7141 // The cost of the non-predicated instruction. 7142 Cost += VF.getKnownMinValue() * 7143 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7144 7145 // The cost of insertelement and extractelement instructions needed for 7146 // scalarization. 7147 Cost += getScalarizationOverhead(I, VF); 7148 7149 // Scale the cost by the probability of executing the predicated blocks. 7150 // This assumes the predicated block for each vector lane is equally 7151 // likely. 7152 return Cost / getReciprocalPredBlockProb(); 7153 } 7154 LLVM_FALLTHROUGH; 7155 case Instruction::Add: 7156 case Instruction::FAdd: 7157 case Instruction::Sub: 7158 case Instruction::FSub: 7159 case Instruction::Mul: 7160 case Instruction::FMul: 7161 case Instruction::FDiv: 7162 case Instruction::FRem: 7163 case Instruction::Shl: 7164 case Instruction::LShr: 7165 case Instruction::AShr: 7166 case Instruction::And: 7167 case Instruction::Or: 7168 case Instruction::Xor: { 7169 // Since we will replace the stride by 1 the multiplication should go away. 7170 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7171 return 0; 7172 // Certain instructions can be cheaper to vectorize if they have a constant 7173 // second vector operand. One example of this are shifts on x86. 7174 Value *Op2 = I->getOperand(1); 7175 TargetTransformInfo::OperandValueProperties Op2VP; 7176 TargetTransformInfo::OperandValueKind Op2VK = 7177 TTI.getOperandInfo(Op2, Op2VP); 7178 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7179 Op2VK = TargetTransformInfo::OK_UniformValue; 7180 7181 SmallVector<const Value *, 4> Operands(I->operand_values()); 7182 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7183 return N * TTI.getArithmeticInstrCost( 7184 I->getOpcode(), VectorTy, CostKind, 7185 TargetTransformInfo::OK_AnyValue, 7186 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7187 } 7188 case Instruction::FNeg: { 7189 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7190 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7191 return N * TTI.getArithmeticInstrCost( 7192 I->getOpcode(), VectorTy, CostKind, 7193 TargetTransformInfo::OK_AnyValue, 7194 TargetTransformInfo::OK_AnyValue, 7195 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7196 I->getOperand(0), I); 7197 } 7198 case Instruction::Select: { 7199 SelectInst *SI = cast<SelectInst>(I); 7200 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7201 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7202 Type *CondTy = SI->getCondition()->getType(); 7203 if (!ScalarCond) { 7204 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7205 CondTy = VectorType::get(CondTy, VF); 7206 } 7207 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7208 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7209 } 7210 case Instruction::ICmp: 7211 case Instruction::FCmp: { 7212 Type *ValTy = I->getOperand(0)->getType(); 7213 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7214 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7215 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7216 VectorTy = ToVectorTy(ValTy, VF); 7217 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7218 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7219 } 7220 case Instruction::Store: 7221 case Instruction::Load: { 7222 ElementCount Width = VF; 7223 if (Width.isVector()) { 7224 InstWidening Decision = getWideningDecision(I, Width); 7225 assert(Decision != CM_Unknown && 7226 "CM decision should be taken at this point"); 7227 if (Decision == CM_Scalarize) 7228 Width = ElementCount::getFixed(1); 7229 } 7230 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7231 return getMemoryInstructionCost(I, VF); 7232 } 7233 case Instruction::ZExt: 7234 case Instruction::SExt: 7235 case Instruction::FPToUI: 7236 case Instruction::FPToSI: 7237 case Instruction::FPExt: 7238 case Instruction::PtrToInt: 7239 case Instruction::IntToPtr: 7240 case Instruction::SIToFP: 7241 case Instruction::UIToFP: 7242 case Instruction::Trunc: 7243 case Instruction::FPTrunc: 7244 case Instruction::BitCast: { 7245 // Computes the CastContextHint from a Load/Store instruction. 7246 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7247 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7248 "Expected a load or a store!"); 7249 7250 if (VF.isScalar() || !TheLoop->contains(I)) 7251 return TTI::CastContextHint::Normal; 7252 7253 switch (getWideningDecision(I, VF)) { 7254 case LoopVectorizationCostModel::CM_GatherScatter: 7255 return TTI::CastContextHint::GatherScatter; 7256 case LoopVectorizationCostModel::CM_Interleave: 7257 return TTI::CastContextHint::Interleave; 7258 case LoopVectorizationCostModel::CM_Scalarize: 7259 case LoopVectorizationCostModel::CM_Widen: 7260 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7261 : TTI::CastContextHint::Normal; 7262 case LoopVectorizationCostModel::CM_Widen_Reverse: 7263 return TTI::CastContextHint::Reversed; 7264 case LoopVectorizationCostModel::CM_Unknown: 7265 llvm_unreachable("Instr did not go through cost modelling?"); 7266 } 7267 7268 llvm_unreachable("Unhandled case!"); 7269 }; 7270 7271 unsigned Opcode = I->getOpcode(); 7272 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7273 // For Trunc, the context is the only user, which must be a StoreInst. 
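// For illustration: in '%t = trunc i32 %v to i16 ; store i16 %t, i16* %p' the store provides the cast context used below.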
7274 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7275 if (I->hasOneUse()) 7276 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7277 CCH = ComputeCCH(Store); 7278 } 7279 // For Z/Sext, the context is the operand, which must be a LoadInst. 7280 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7281 Opcode == Instruction::FPExt) { 7282 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7283 CCH = ComputeCCH(Load); 7284 } 7285 7286 // We optimize the truncation of induction variables having constant 7287 // integer steps. The cost of these truncations is the same as the scalar 7288 // operation. 7289 if (isOptimizableIVTruncate(I, VF)) { 7290 auto *Trunc = cast<TruncInst>(I); 7291 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7292 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7293 } 7294 7295 Type *SrcScalarTy = I->getOperand(0)->getType(); 7296 Type *SrcVecTy = 7297 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7298 if (canTruncateToMinimalBitwidth(I, VF)) { 7299 // This cast is going to be shrunk. This may remove the cast or it might 7300 // turn it into slightly different cast. For example, if MinBW == 16, 7301 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7302 // 7303 // Calculate the modified src and dest types. 7304 Type *MinVecTy = VectorTy; 7305 if (Opcode == Instruction::Trunc) { 7306 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7307 VectorTy = 7308 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7309 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7310 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7311 VectorTy = 7312 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7313 } 7314 } 7315 7316 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7317 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7318 return N * 7319 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7320 } 7321 case Instruction::Call: { 7322 bool NeedToScalarize; 7323 CallInst *CI = cast<CallInst>(I); 7324 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7325 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7326 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7327 return std::min(CallCost, IntrinsicCost); 7328 } 7329 return CallCost; 7330 } 7331 case Instruction::ExtractValue: 7332 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7333 default: 7334 // The cost of executing VF copies of the scalar instruction. This opcode 7335 // is unknown. Assume that it is the same as 'mul'. 7336 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7337 Instruction::Mul, VectorTy, CostKind) + 7338 getScalarizationOverhead(I, VF); 7339 } // end of switch. 
7340 } 7341 7342 char LoopVectorize::ID = 0; 7343 7344 static const char lv_name[] = "Loop Vectorization"; 7345 7346 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7347 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7348 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7349 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7350 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7351 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7352 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7353 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7354 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7355 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7356 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7357 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7358 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7359 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7360 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7361 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7362 7363 namespace llvm { 7364 7365 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7366 7367 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7368 bool VectorizeOnlyWhenForced) { 7369 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7370 } 7371 7372 } // end namespace llvm 7373 7374 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7375 // Check if the pointer operand of a load or store instruction is 7376 // consecutive. 7377 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7378 return Legal->isConsecutivePtr(Ptr); 7379 return false; 7380 } 7381 7382 void LoopVectorizationCostModel::collectValuesToIgnore() { 7383 // Ignore ephemeral values. 7384 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7385 7386 // Ignore type-promoting instructions we identified during reduction 7387 // detection. 7388 for (auto &Reduction : Legal->getReductionVars()) { 7389 RecurrenceDescriptor &RedDes = Reduction.second; 7390 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7391 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7392 } 7393 // Ignore type-casting instructions we identified during induction 7394 // detection. 7395 for (auto &Induction : Legal->getInductionVars()) { 7396 InductionDescriptor &IndDes = Induction.second; 7397 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7398 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7399 } 7400 } 7401 7402 void LoopVectorizationCostModel::collectInLoopReductions() { 7403 for (auto &Reduction : Legal->getReductionVars()) { 7404 PHINode *Phi = Reduction.first; 7405 RecurrenceDescriptor &RdxDesc = Reduction.second; 7406 7407 // We don't collect reductions that are type promoted (yet). 7408 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7409 continue; 7410 7411 // If the target would prefer this reduction to happen "in-loop", then we 7412 // want to record it as such. 7413 unsigned Opcode = RdxDesc.getOpcode(); 7414 if (!PreferInLoopReductions && 7415 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7416 TargetTransformInfo::ReductionFlags())) 7417 continue; 7418 7419 // Check that we can correctly put the reductions into the loop, by 7420 // finding the chain of operations that leads from the phi to the loop 7421 // exit value. 
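// For example, for a simple integer 'sum += a[i]' reduction the chain is just the add feeding the header phi across the back-edge; if no such chain can be found, getReductionOpChain returns an empty vector and the reduction stays out-of-loop.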
7422 SmallVector<Instruction *, 4> ReductionOperations = 7423 RdxDesc.getReductionOpChain(Phi, TheLoop); 7424 bool InLoop = !ReductionOperations.empty(); 7425 if (InLoop) 7426 InLoopReductionChains[Phi] = ReductionOperations; 7427 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7428 << " reduction for phi: " << *Phi << "\n"); 7429 } 7430 } 7431 7432 // TODO: we could return a pair of values that specify the max VF and 7433 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7434 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment 7435 // doesn't have a cost model that can choose which plan to execute if 7436 // more than one is generated. 7437 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7438 LoopVectorizationCostModel &CM) { 7439 unsigned WidestType; 7440 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7441 return WidestVectorRegBits / WidestType; 7442 } 7443 7444 VectorizationFactor 7445 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7446 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7447 ElementCount VF = UserVF; 7448 // Outer loop handling: outer loops may require CFG and instruction level 7449 // transformations before even evaluating whether vectorization is profitable. 7450 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7451 // the vectorization pipeline. 7452 if (!OrigLoop->isInnermost()) { 7453 // If the user doesn't provide a vectorization factor, determine a 7454 // reasonable one. 7455 if (UserVF.isZero()) { 7456 VF = ElementCount::getFixed( 7457 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 7458 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7459 7460 // Make sure we have a VF > 1 for stress testing. 7461 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7462 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7463 << "overriding computed VF.\n"); 7464 VF = ElementCount::getFixed(4); 7465 } 7466 } 7467 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7468 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7469 "VF needs to be a power of two"); 7470 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7471 << "VF " << VF << " to build VPlans.\n"); 7472 buildVPlans(VF, VF); 7473 7474 // For VPlan build stress testing, we bail out after VPlan construction. 7475 if (VPlanBuildStressTest) 7476 return VectorizationFactor::Disabled(); 7477 7478 return {VF, 0 /*Cost*/}; 7479 } 7480 7481 LLVM_DEBUG( 7482 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7483 "VPlan-native path.\n"); 7484 return VectorizationFactor::Disabled(); 7485 } 7486 7487 Optional<VectorizationFactor> 7488 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7489 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7490 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7491 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved. 7492 return None; 7493 7494 // Invalidate interleave groups if all blocks of loop will be predicated.
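// Checking the header is enough here: the header only needs predication when the tail is folded by masking, and in that case every block of the loop becomes predicated.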
7495 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7496 !useMaskedInterleavedAccesses(*TTI)) { 7497 LLVM_DEBUG( 7498 dbgs() 7499 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7500 "which requires masked-interleaved support.\n"); 7501 if (CM.InterleaveInfo.invalidateGroups()) 7502 // Invalidating interleave groups also requires invalidating all decisions 7503 // based on them, which includes widening decisions and uniform and scalar 7504 // values. 7505 CM.invalidateCostModelingDecisions(); 7506 } 7507 7508 ElementCount MaxVF = MaybeMaxVF.getValue(); 7509 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7510 7511 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7512 if (!UserVF.isZero() && 7513 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7514 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7515 // VFs here, this should be reverted to only use legal UserVFs once the 7516 // loop below supports scalable VFs. 7517 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7518 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7519 << " VF " << VF << ".\n"); 7520 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7521 "VF needs to be a power of two"); 7522 // Collect the instructions (and their associated costs) that will be more 7523 // profitable to scalarize. 7524 CM.selectUserVectorizationFactor(VF); 7525 CM.collectInLoopReductions(); 7526 buildVPlansWithVPRecipes(VF, VF); 7527 LLVM_DEBUG(printPlans(dbgs())); 7528 return {{VF, 0}}; 7529 } 7530 7531 assert(!MaxVF.isScalable() && 7532 "Scalable vectors not yet supported beyond this point"); 7533 7534 for (ElementCount VF = ElementCount::getFixed(1); 7535 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7536 // Collect Uniform and Scalar instructions after vectorization with VF. 7537 CM.collectUniformsAndScalars(VF); 7538 7539 // Collect the instructions (and their associated costs) that will be more 7540 // profitable to scalarize. 7541 if (VF.isVector()) 7542 CM.collectInstsToScalarize(VF); 7543 } 7544 7545 CM.collectInLoopReductions(); 7546 7547 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7548 LLVM_DEBUG(printPlans(dbgs())); 7549 if (MaxVF.isScalar()) 7550 return VectorizationFactor::Disabled(); 7551 7552 // Select the optimal vectorization factor. 7553 return CM.selectVectorizationFactor(MaxVF); 7554 } 7555 7556 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7557 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7558 << '\n'); 7559 BestVF = VF; 7560 BestUF = UF; 7561 7562 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7563 return !Plan->hasVF(VF); 7564 }); 7565 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7566 } 7567 7568 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7569 DominatorTree *DT) { 7570 // Perform the actual loop transformation. 7571 7572 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
7573 VPCallbackILV CallbackILV(ILV); 7574 7575 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7576 7577 VPTransformState State{*BestVF, BestUF, LI, 7578 DT, ILV.Builder, ILV.VectorLoopValueMap, 7579 &ILV, CallbackILV}; 7580 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7581 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7582 State.CanonicalIV = ILV.Induction; 7583 7584 ILV.printDebugTracesAtStart(); 7585 7586 //===------------------------------------------------===// 7587 // 7588 // Notice: any optimization or new instruction that goes 7589 // into the code below should also be implemented in 7590 // the cost-model. 7591 // 7592 //===------------------------------------------------===// 7593 7594 // 2. Copy and widen instructions from the old loop into the new loop. 7595 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7596 VPlans.front()->execute(&State); 7597 7598 // 3. Fix the vectorized code: take care of header phis, live-outs, 7599 // predication, updating analyses. 7600 ILV.fixVectorizedLoop(); 7601 7602 ILV.printDebugTracesAtEnd(); 7603 } 7604 7605 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7606 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7607 7608 // We create new control-flow for the vectorized loop, so the original exit 7609 // conditions will be dead after vectorization if they are only used by the 7610 // terminators. 7611 SmallVector<BasicBlock*> ExitingBlocks; 7612 OrigLoop->getExitingBlocks(ExitingBlocks); 7613 for (auto *BB : ExitingBlocks) { 7614 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7615 if (!Cmp || !Cmp->hasOneUse()) 7616 continue; 7617 7618 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7619 if (!DeadInstructions.insert(Cmp).second) 7620 continue; 7621 7622 // The operands of the icmp are often dead truncs, used by IndUpdate. 7623 // TODO: can recurse through operands in general 7624 for (Value *Op : Cmp->operands()) { 7625 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7626 DeadInstructions.insert(cast<Instruction>(Op)); 7627 } 7628 } 7629 7630 // We create new "steps" for induction variable updates to which the original 7631 // induction variables map. An original update instruction will be dead if 7632 // all its users except the induction variable are dead. 7633 auto *Latch = OrigLoop->getLoopLatch(); 7634 for (auto &Induction : Legal->getInductionVars()) { 7635 PHINode *Ind = Induction.first; 7636 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7637 7638 // If the tail is to be folded by masking, the primary induction variable, 7639 // if it exists, isn't dead: it will be used for masking. Don't kill it. 7640 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7641 continue; 7642 7643 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7644 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7645 })) 7646 DeadInstructions.insert(IndUpdate); 7647 7648 // We also record as "Dead" the type-casting instructions we had identified 7649 // during induction analysis. We don't need any handling for them in the 7650 // vectorized loop because we have proven that, under a proper runtime 7651 // test guarding the vectorized loop, the value of the phi, and the casted 7652 // value of the phi, are the same. The last instruction in this casting chain 7653 // will get its scalar/vector/widened def from the scalar/vector/widened def 7654 // of the respective phi node.
Any other casts in the induction def-use chain 7655 // have no other uses outside the phi update chain, and will be ignored. 7656 InductionDescriptor &IndDes = Induction.second; 7657 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7658 DeadInstructions.insert(Casts.begin(), Casts.end()); 7659 } 7660 } 7661 7662 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7663 7664 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7665 7666 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7667 Instruction::BinaryOps BinOp) { 7668 // When unrolling and the VF is 1, we only need to add a simple scalar. 7669 Type *Ty = Val->getType(); 7670 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7671 7672 if (Ty->isFloatingPointTy()) { 7673 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7674 7675 // Floating point operations had to be 'fast' to enable the unrolling. 7676 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7677 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7678 } 7679 Constant *C = ConstantInt::get(Ty, StartIdx); 7680 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7681 } 7682 7683 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7684 SmallVector<Metadata *, 4> MDs; 7685 // Reserve first location for self reference to the LoopID metadata node. 7686 MDs.push_back(nullptr); 7687 bool IsUnrollMetadata = false; 7688 MDNode *LoopID = L->getLoopID(); 7689 if (LoopID) { 7690 // First find existing loop unrolling disable metadata. 7691 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7692 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7693 if (MD) { 7694 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7695 IsUnrollMetadata = 7696 S && S->getString().startswith("llvm.loop.unroll.disable"); 7697 } 7698 MDs.push_back(LoopID->getOperand(i)); 7699 } 7700 } 7701 7702 if (!IsUnrollMetadata) { 7703 // Add runtime unroll disable metadata. 7704 LLVMContext &Context = L->getHeader()->getContext(); 7705 SmallVector<Metadata *, 1> DisableOperands; 7706 DisableOperands.push_back( 7707 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7708 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7709 MDs.push_back(DisableNode); 7710 MDNode *NewLoopID = MDNode::get(Context, MDs); 7711 // Set operand 0 to refer to the loop id itself. 7712 NewLoopID->replaceOperandWith(0, NewLoopID); 7713 L->setLoopID(NewLoopID); 7714 } 7715 } 7716 7717 //===--------------------------------------------------------------------===// 7718 // EpilogueVectorizerMainLoop 7719 //===--------------------------------------------------------------------===// 7720 7721 /// This function is partially responsible for generating the control flow 7722 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7723 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7724 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7725 Loop *Lp = createVectorLoopSkeleton(""); 7726 7727 // Generate the code to check the minimum iteration count of the vector 7728 // epilogue (see below). 7729 EPI.EpilogueIterationCountCheck = 7730 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7731 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7732 7733 // Generate the code to check any assumptions that we've made for SCEV 7734 // expressions. 
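// If emitSCEVChecks emits a check, it splits off a new vector preheader and the previously saved block becomes the check block; comparing the saved pointer against LoopVectorPreHeader below detects that case.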
7735 BasicBlock *SavedPreHeader = LoopVectorPreHeader; 7736 emitSCEVChecks(Lp, LoopScalarPreHeader); 7737 7738 // If a safety check was generated, save it. 7739 if (SavedPreHeader != LoopVectorPreHeader) 7740 EPI.SCEVSafetyCheck = SavedPreHeader; 7741 7742 // Generate the code that checks at runtime if arrays overlap. We put the 7743 // checks into a separate block to make the more common case of few elements 7744 // faster. 7745 SavedPreHeader = LoopVectorPreHeader; 7746 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 7747 7748 // If a safety check was generated, save/overwrite it. 7749 if (SavedPreHeader != LoopVectorPreHeader) 7750 EPI.MemSafetyCheck = SavedPreHeader; 7751 7752 // Generate the iteration count check for the main loop, *after* the check 7753 // for the epilogue loop, so that the path-length is shorter for the case 7754 // that goes directly through the vector epilogue. The longer-path length for 7755 // the main loop is compensated for by the gain from vectorizing the larger 7756 // trip count. Note: the branch will get updated later on when we vectorize 7757 // the epilogue. 7758 EPI.MainLoopIterationCountCheck = 7759 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7760 7761 // Generate the induction variable. 7762 OldInduction = Legal->getPrimaryInduction(); 7763 Type *IdxTy = Legal->getWidestInductionType(); 7764 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7765 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7766 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7767 EPI.VectorTripCount = CountRoundDown; 7768 Induction = 7769 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7770 getDebugLocFromInstOrOperands(OldInduction)); 7771 7772 // Skip induction resume value creation here because the resume values will be 7773 // created in the second pass. If we created them here, they wouldn't be used anyway, 7774 // because the vplan in the second pass still contains the inductions from the 7775 // original loop. 7776 7777 return completeLoopSkeleton(Lp, OrigLoopID); 7778 } 7779 7780 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7781 LLVM_DEBUG({ 7782 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7783 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7784 << ", Main Loop UF:" << EPI.MainLoopUF 7785 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7786 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7787 }); 7788 } 7789 7790 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7791 DEBUG_WITH_TYPE(VerboseDebug, { 7792 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 7793 }); 7794 } 7795 7796 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 7797 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 7798 assert(L && "Expected valid Loop."); 7799 assert(Bypass && "Expected valid bypass basic block."); 7800 unsigned VFactor = 7801 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 7802 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7803 Value *Count = getOrCreateTripCount(L); 7804 // Reuse existing vector loop preheader for TC checks. 7805 // Note that a new preheader block is generated for the vector loop. 7806 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7807 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7808 7809 // Generate code to check if the loop's trip count is less than VF * UF of the 7810 // main vector loop. 7811 auto P = 7812 Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7813 7814 Value *CheckMinIters = Builder.CreateICmp( 7815 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 7816 "min.iters.check"); 7817 7818 if (!ForEpilogue) 7819 TCCheckBlock->setName("vector.main.loop.iter.check"); 7820 7821 // Create new preheader for vector loop. 7822 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7823 DT, LI, nullptr, "vector.ph"); 7824 7825 if (ForEpilogue) { 7826 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7827 DT->getNode(Bypass)->getIDom()) && 7828 "TC check is expected to dominate Bypass"); 7829 7830 // Update dominator for Bypass & LoopExit. 7831 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7832 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7833 7834 LoopBypassBlocks.push_back(TCCheckBlock); 7835 7836 // Save the trip count so we don't have to regenerate it in the 7837 // vec.epilog.iter.check. This is safe to do because the trip count 7838 // generated here dominates the vector epilog iter check. 7839 EPI.TripCount = Count; 7840 } 7841 7842 ReplaceInstWithInst( 7843 TCCheckBlock->getTerminator(), 7844 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7845 7846 return TCCheckBlock; 7847 } 7848 7849 //===--------------------------------------------------------------------===// 7850 // EpilogueVectorizerEpilogueLoop 7851 //===--------------------------------------------------------------------===// 7852 7853 /// This function is partially responsible for generating the control flow 7854 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7855 BasicBlock * 7856 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7857 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7858 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 7859 7860 // Now, compare the remaining count and if there aren't enough iterations to 7861 // execute the vectorized epilogue skip to the scalar part. 7862 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7863 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7864 LoopVectorPreHeader = 7865 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7866 LI, nullptr, "vec.epilog.ph"); 7867 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 7868 VecEpilogueIterationCountCheck); 7869 7870 // Adjust the control flow taking the state info from the main loop 7871 // vectorization into account. 
7872 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7873 "expected this to be saved from the previous pass."); 7874 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7875 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7876 7877 DT->changeImmediateDominator(LoopVectorPreHeader, 7878 EPI.MainLoopIterationCountCheck); 7879 7880 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7881 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7882 7883 if (EPI.SCEVSafetyCheck) 7884 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7885 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7886 if (EPI.MemSafetyCheck) 7887 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7888 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7889 7890 DT->changeImmediateDominator( 7891 VecEpilogueIterationCountCheck, 7892 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7893 7894 DT->changeImmediateDominator(LoopScalarPreHeader, 7895 EPI.EpilogueIterationCountCheck); 7896 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 7897 7898 // Keep track of bypass blocks, as they feed start values to the induction 7899 // phis in the scalar loop preheader. 7900 if (EPI.SCEVSafetyCheck) 7901 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7902 if (EPI.MemSafetyCheck) 7903 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7904 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7905 7906 // Generate a resume induction for the vector epilogue and put it in the 7907 // vector epilogue preheader 7908 Type *IdxTy = Legal->getWidestInductionType(); 7909 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 7910 LoopVectorPreHeader->getFirstNonPHI()); 7911 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 7912 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 7913 EPI.MainLoopIterationCountCheck); 7914 7915 // Generate the induction variable. 7916 OldInduction = Legal->getPrimaryInduction(); 7917 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7918 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7919 Value *StartIdx = EPResumeVal; 7920 Induction = 7921 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7922 getDebugLocFromInstOrOperands(OldInduction)); 7923 7924 // Generate induction resume values. These variables save the new starting 7925 // indexes for the scalar loop. They are used to test if there are any tail 7926 // iterations left once the vector loop has completed. 7927 // Note that when the vectorized epilogue is skipped due to iteration count 7928 // check, then the resume value for the induction variable comes from 7929 // the trip count of the main vector loop, hence passing the AdditionalBypass 7930 // argument. 
7931 createInductionResumeValues(Lp, CountRoundDown, 7932 {VecEpilogueIterationCountCheck, 7933 EPI.VectorTripCount} /* AdditionalBypass */); 7934 7935 AddRuntimeUnrollDisableMetaData(Lp); 7936 return completeLoopSkeleton(Lp, OrigLoopID); 7937 } 7938 7939 BasicBlock * 7940 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 7941 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 7942 7943 assert(EPI.TripCount && 7944 "Expected trip count to have been safed in the first pass."); 7945 assert( 7946 (!isa<Instruction>(EPI.TripCount) || 7947 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 7948 "saved trip count does not dominate insertion point."); 7949 Value *TC = EPI.TripCount; 7950 IRBuilder<> Builder(Insert->getTerminator()); 7951 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 7952 7953 // Generate code to check if the loop's trip count is less than VF * UF of the 7954 // vector epilogue loop. 7955 auto P = 7956 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7957 7958 Value *CheckMinIters = Builder.CreateICmp( 7959 P, Count, 7960 ConstantInt::get(Count->getType(), 7961 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 7962 "min.epilog.iters.check"); 7963 7964 ReplaceInstWithInst( 7965 Insert->getTerminator(), 7966 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7967 7968 LoopBypassBlocks.push_back(Insert); 7969 return Insert; 7970 } 7971 7972 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7973 LLVM_DEBUG({ 7974 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7975 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7976 << ", Main Loop UF:" << EPI.MainLoopUF 7977 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7978 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7979 }); 7980 } 7981 7982 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7983 DEBUG_WITH_TYPE(VerboseDebug, { 7984 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 7985 }); 7986 } 7987 7988 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7989 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7990 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7991 bool PredicateAtRangeStart = Predicate(Range.Start); 7992 7993 for (ElementCount TmpVF = Range.Start * 2; 7994 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 7995 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7996 Range.End = TmpVF; 7997 break; 7998 } 7999 8000 return PredicateAtRangeStart; 8001 } 8002 8003 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8004 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8005 /// of VF's starting at a given VF and extending it as much as possible. Each 8006 /// vectorization decision can potentially shorten this sub-range during 8007 /// buildVPlan(). 8008 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8009 ElementCount MaxVF) { 8010 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8011 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8012 VFRange SubRange = {VF, MaxVFPlusOne}; 8013 VPlans.push_back(buildVPlan(SubRange)); 8014 VF = SubRange.End; 8015 } 8016 } 8017 8018 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8019 VPlanPtr &Plan) { 8020 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8021 8022 // Look for cached value. 
8023 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8024 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8025 if (ECEntryIt != EdgeMaskCache.end()) 8026 return ECEntryIt->second; 8027 8028 VPValue *SrcMask = createBlockInMask(Src, Plan); 8029 8030 // The terminator has to be a branch inst! 8031 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8032 assert(BI && "Unexpected terminator found"); 8033 8034 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8035 return EdgeMaskCache[Edge] = SrcMask; 8036 8037 // If source is an exiting block, we know the exit edge is dynamically dead 8038 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8039 // adding uses of an otherwise potentially dead instruction. 8040 if (OrigLoop->isLoopExiting(Src)) 8041 return EdgeMaskCache[Edge] = SrcMask; 8042 8043 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8044 assert(EdgeMask && "No Edge Mask found for condition"); 8045 8046 if (BI->getSuccessor(0) != Dst) 8047 EdgeMask = Builder.createNot(EdgeMask); 8048 8049 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 8050 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 8051 8052 return EdgeMaskCache[Edge] = EdgeMask; 8053 } 8054 8055 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8056 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8057 8058 // Look for cached value. 8059 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8060 if (BCEntryIt != BlockMaskCache.end()) 8061 return BCEntryIt->second; 8062 8063 // All-one mask is modelled as no-mask following the convention for masked 8064 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8065 VPValue *BlockMask = nullptr; 8066 8067 if (OrigLoop->getHeader() == BB) { 8068 if (!CM.blockNeedsPredication(BB)) 8069 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8070 8071 // Create the block in mask as the first non-phi instruction in the block. 8072 VPBuilder::InsertPointGuard Guard(Builder); 8073 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8074 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8075 8076 // Introduce the early-exit compare IV <= BTC to form header block mask. 8077 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8078 // Start by constructing the desired canonical IV. 8079 VPValue *IV = nullptr; 8080 if (Legal->getPrimaryInduction()) 8081 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8082 else { 8083 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8084 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8085 IV = IVRecipe->getVPValue(); 8086 } 8087 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8088 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8089 8090 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8091 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8092 // as a second argument, we only pass the IV here and extract the 8093 // tripcount from the transform state where codegen of the VP instructions 8094 // happen. 8095 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8096 } else { 8097 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8098 } 8099 return BlockMaskCache[BB] = BlockMask; 8100 } 8101 8102 // This is the block mask. We OR all incoming edges. 
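// If any incoming edge has an all-one mask (represented as nullptr), the block mask is all-one as well, so we can return early.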
8103 for (auto *Predecessor : predecessors(BB)) { 8104 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8105 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8106 return BlockMaskCache[BB] = EdgeMask; 8107 8108 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8109 BlockMask = EdgeMask; 8110 continue; 8111 } 8112 8113 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8114 } 8115 8116 return BlockMaskCache[BB] = BlockMask; 8117 } 8118 8119 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8120 VPlanPtr &Plan) { 8121 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8122 "Must be called with either a load or store"); 8123 8124 auto willWiden = [&](ElementCount VF) -> bool { 8125 if (VF.isScalar()) 8126 return false; 8127 LoopVectorizationCostModel::InstWidening Decision = 8128 CM.getWideningDecision(I, VF); 8129 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8130 "CM decision should be taken at this point."); 8131 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8132 return true; 8133 if (CM.isScalarAfterVectorization(I, VF) || 8134 CM.isProfitableToScalarize(I, VF)) 8135 return false; 8136 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8137 }; 8138 8139 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8140 return nullptr; 8141 8142 VPValue *Mask = nullptr; 8143 if (Legal->isMaskRequired(I)) 8144 Mask = createBlockInMask(I->getParent(), Plan); 8145 8146 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8147 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8148 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8149 8150 StoreInst *Store = cast<StoreInst>(I); 8151 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8152 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8153 } 8154 8155 VPWidenIntOrFpInductionRecipe * 8156 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { 8157 // Check if this is an integer or fp induction. If so, build the recipe that 8158 // produces its scalar and vector values. 8159 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8160 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8161 II.getKind() == InductionDescriptor::IK_FpInduction) { 8162 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8163 return new VPWidenIntOrFpInductionRecipe(Phi, Start); 8164 } 8165 8166 return nullptr; 8167 } 8168 8169 VPWidenIntOrFpInductionRecipe * 8170 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, 8171 VPlan &Plan) const { 8172 // Optimize the special case where the source is a constant integer 8173 // induction variable. Notice that we can only optimize the 'trunc' case 8174 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8175 // (c) other casts depend on pointer size. 8176 8177 // Determine whether \p K is a truncation based on an induction variable that 8178 // can be optimized. 
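// e.g. a 'trunc i64 %iv to i32' where %iv is an induction with a constant integer step can be replaced by an induction computed directly in the narrower type.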
8179 auto isOptimizableIVTruncate = 8180 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8181 return [=](ElementCount VF) -> bool { 8182 return CM.isOptimizableIVTruncate(K, VF); 8183 }; 8184 }; 8185 8186 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8187 isOptimizableIVTruncate(I), Range)) { 8188 8189 InductionDescriptor II = 8190 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8191 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8192 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8193 Start, I); 8194 } 8195 return nullptr; 8196 } 8197 8198 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8199 // We know that all PHIs in non-header blocks are converted into selects, so 8200 // we don't have to worry about the insertion order and we can just use the 8201 // builder. At this point we generate the predication tree. There may be 8202 // duplications since this is a simple recursive scan, but future 8203 // optimizations will clean it up. 8204 8205 SmallVector<VPValue *, 2> Operands; 8206 unsigned NumIncoming = Phi->getNumIncomingValues(); 8207 for (unsigned In = 0; In < NumIncoming; In++) { 8208 VPValue *EdgeMask = 8209 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8210 assert((EdgeMask || NumIncoming == 1) && 8211 "Multiple predecessors with one having a full mask"); 8212 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8213 if (EdgeMask) 8214 Operands.push_back(EdgeMask); 8215 } 8216 return new VPBlendRecipe(Phi, Operands); 8217 } 8218 8219 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8220 VPlan &Plan) const { 8221 8222 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8223 [this, CI](ElementCount VF) { 8224 return CM.isScalarWithPredication(CI, VF); 8225 }, 8226 Range); 8227 8228 if (IsPredicated) 8229 return nullptr; 8230 8231 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8232 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8233 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8234 ID == Intrinsic::pseudoprobe || 8235 ID == Intrinsic::experimental_noalias_scope_decl)) 8236 return nullptr; 8237 8238 auto willWiden = [&](ElementCount VF) -> bool { 8239 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8240 // The following case may be scalarized depending on the VF. 8241 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8242 // version of the instruction. 8243 // Is it beneficial to perform intrinsic call compared to lib call? 8244 bool NeedToScalarize = false; 8245 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8246 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8247 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8248 assert(IntrinsicCost.isValid() && CallCost.isValid() && 8249 "Cannot have invalid costs while widening"); 8250 return UseVectorIntrinsic || !NeedToScalarize; 8251 }; 8252 8253 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8254 return nullptr; 8255 8256 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8257 } 8258 8259 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8260 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8261 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8262 // Instruction should be widened, unless it is scalar after vectorization, 8263 // scalarization is profitable or it is predicated. 8264 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8265 return CM.isScalarAfterVectorization(I, VF) || 8266 CM.isProfitableToScalarize(I, VF) || 8267 CM.isScalarWithPredication(I, VF); 8268 }; 8269 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8270 Range); 8271 } 8272 8273 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8274 auto IsVectorizableOpcode = [](unsigned Opcode) { 8275 switch (Opcode) { 8276 case Instruction::Add: 8277 case Instruction::And: 8278 case Instruction::AShr: 8279 case Instruction::BitCast: 8280 case Instruction::FAdd: 8281 case Instruction::FCmp: 8282 case Instruction::FDiv: 8283 case Instruction::FMul: 8284 case Instruction::FNeg: 8285 case Instruction::FPExt: 8286 case Instruction::FPToSI: 8287 case Instruction::FPToUI: 8288 case Instruction::FPTrunc: 8289 case Instruction::FRem: 8290 case Instruction::FSub: 8291 case Instruction::ICmp: 8292 case Instruction::IntToPtr: 8293 case Instruction::LShr: 8294 case Instruction::Mul: 8295 case Instruction::Or: 8296 case Instruction::PtrToInt: 8297 case Instruction::SDiv: 8298 case Instruction::Select: 8299 case Instruction::SExt: 8300 case Instruction::Shl: 8301 case Instruction::SIToFP: 8302 case Instruction::SRem: 8303 case Instruction::Sub: 8304 case Instruction::Trunc: 8305 case Instruction::UDiv: 8306 case Instruction::UIToFP: 8307 case Instruction::URem: 8308 case Instruction::Xor: 8309 case Instruction::ZExt: 8310 return true; 8311 } 8312 return false; 8313 }; 8314 8315 if (!IsVectorizableOpcode(I->getOpcode())) 8316 return nullptr; 8317 8318 // Success: widen this instruction. 8319 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8320 } 8321 8322 VPBasicBlock *VPRecipeBuilder::handleReplication( 8323 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8324 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 8325 VPlanPtr &Plan) { 8326 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8327 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8328 Range); 8329 8330 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8331 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8332 Range); 8333 8334 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8335 IsUniform, IsPredicated); 8336 setRecipe(I, Recipe); 8337 Plan->addVPValue(I, Recipe); 8338 8339 // Find if I uses a predicated instruction. If so, it will use its scalar 8340 // value. Avoid hoisting the insert-element which packs the scalar value into 8341 // a vector value, as that happens iff all users use the vector value. 
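  // For illustration: if one of I's operands is a predicated instruction that
  // was marked AlsoPack, the scalar use introduced here means not all of its
  // users consume the packed vector value, so hoisting its insert-element
  // sequence is no longer valid; the loop below clears the flag on such
  // recipes.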
8342 for (auto &Op : I->operands()) 8343 if (auto *PredInst = dyn_cast<Instruction>(Op)) 8344 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 8345 PredInst2Recipe[PredInst]->setAlsoPack(false); 8346 8347 // Finalize the recipe for Instr, first if it is not predicated. 8348 if (!IsPredicated) { 8349 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8350 VPBB->appendRecipe(Recipe); 8351 return VPBB; 8352 } 8353 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8354 assert(VPBB->getSuccessors().empty() && 8355 "VPBB has successors when handling predicated replication."); 8356 // Record predicated instructions for above packing optimizations. 8357 PredInst2Recipe[I] = Recipe; 8358 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8359 VPBlockUtils::insertBlockAfter(Region, VPBB); 8360 auto *RegSucc = new VPBasicBlock(); 8361 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8362 return RegSucc; 8363 } 8364 8365 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8366 VPRecipeBase *PredRecipe, 8367 VPlanPtr &Plan) { 8368 // Instructions marked for predication are replicated and placed under an 8369 // if-then construct to prevent side-effects. 8370 8371 // Generate recipes to compute the block mask for this region. 8372 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8373 8374 // Build the triangular if-then region. 8375 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8376 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8377 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8378 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8379 auto *PHIRecipe = Instr->getType()->isVoidTy() 8380 ? nullptr 8381 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8382 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8383 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8384 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8385 8386 // Note: first set Entry as region entry and then connect successors starting 8387 // from it in order, to propagate the "parent" of each VPBasicBlock. 8388 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8389 VPBlockUtils::connectBlocks(Pred, Exit); 8390 8391 return Region; 8392 } 8393 8394 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8395 VFRange &Range, 8396 VPlanPtr &Plan) { 8397 // First, check for specific widening recipes that deal with calls, memory 8398 // operations, inductions and Phi nodes. 
8399 if (auto *CI = dyn_cast<CallInst>(Instr)) 8400 return tryToWidenCall(CI, Range, *Plan); 8401 8402 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8403 return tryToWidenMemory(Instr, Range, Plan); 8404 8405 VPRecipeBase *Recipe; 8406 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8407 if (Phi->getParent() != OrigLoop->getHeader()) 8408 return tryToBlend(Phi, Plan); 8409 if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan))) 8410 return Recipe; 8411 8412 if (Legal->isReductionVariable(Phi)) { 8413 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8414 VPValue *StartV = 8415 Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue()); 8416 return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV); 8417 } 8418 8419 return new VPWidenPHIRecipe(Phi); 8420 } 8421 8422 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( 8423 cast<TruncInst>(Instr), Range, *Plan))) 8424 return Recipe; 8425 8426 if (!shouldWiden(Instr, Range)) 8427 return nullptr; 8428 8429 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8430 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 8431 OrigLoop); 8432 8433 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8434 bool InvariantCond = 8435 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8436 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 8437 InvariantCond); 8438 } 8439 8440 return tryToWiden(Instr, *Plan); 8441 } 8442 8443 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8444 ElementCount MaxVF) { 8445 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8446 8447 // Collect instructions from the original loop that will become trivially dead 8448 // in the vectorized loop. We don't need to vectorize these instructions. For 8449 // example, original induction update instructions can become dead because we 8450 // separately emit induction "steps" when generating code for the new loop. 8451 // Similarly, we create a new latch condition when setting up the structure 8452 // of the new loop, so the old one can become dead. 8453 SmallPtrSet<Instruction *, 4> DeadInstructions; 8454 collectTriviallyDeadInstructions(DeadInstructions); 8455 8456 // Add assume instructions we need to drop to DeadInstructions, to prevent 8457 // them from being added to the VPlan. 8458 // TODO: We only need to drop assumes in blocks that get flattend. If the 8459 // control flow is preserved, we should keep them. 8460 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 8461 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 8462 8463 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 8464 // Dead instructions do not need sinking. Remove them from SinkAfter. 
8465   for (Instruction *I : DeadInstructions)
8466     SinkAfter.erase(I);
8467 
8468   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8469   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8470     VFRange SubRange = {VF, MaxVFPlusOne};
8471     VPlans.push_back(
8472         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8473     VF = SubRange.End;
8474   }
8475 }
8476 
8477 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8478     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8479     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8480 
8481   // Hold a mapping from predicated instructions to their recipes, in order to
8482   // fix their AlsoPack behavior if a user is determined to replicate and use a
8483   // scalar instead of vector value.
8484   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8485 
8486   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8487 
8488   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8489 
8490   // ---------------------------------------------------------------------------
8491   // Pre-construction: record ingredients whose recipes we'll need to further
8492   // process after constructing the initial VPlan.
8493   // ---------------------------------------------------------------------------
8494 
8495   // Mark instructions we'll need to sink later and their targets as
8496   // ingredients whose recipe we'll need to record.
8497   for (auto &Entry : SinkAfter) {
8498     RecipeBuilder.recordRecipeOf(Entry.first);
8499     RecipeBuilder.recordRecipeOf(Entry.second);
8500   }
8501   for (auto &Reduction : CM.getInLoopReductionChains()) {
8502     PHINode *Phi = Reduction.first;
8503     RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
8504     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8505 
8506     RecipeBuilder.recordRecipeOf(Phi);
8507     for (auto &R : ReductionOperations) {
8508       RecipeBuilder.recordRecipeOf(R);
8509       // For min/max reductions, where we have a pair of icmp/select, we also
8510       // need to record the ICmp recipe, so it can be removed later.
8511       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8512         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8513     }
8514   }
8515 
8516   // For each interleave group which is relevant for this (possibly trimmed)
8517   // Range, add it to the set of groups to be later applied to the VPlan and add
8518   // placeholders for its members' Recipes which we'll be replacing with a
8519   // single VPInterleaveRecipe.
8520   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8521     auto applyIG = [IG, this](ElementCount VF) -> bool {
8522       return (VF.isVector() && // Query is illegal for VF == 1
8523               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8524                   LoopVectorizationCostModel::CM_Interleave);
8525     };
8526     if (!getDecisionAndClampRange(applyIG, Range))
8527       continue;
8528     InterleaveGroups.insert(IG);
8529     for (unsigned i = 0; i < IG->getFactor(); i++)
8530       if (Instruction *Member = IG->getMember(i))
8531         RecipeBuilder.recordRecipeOf(Member);
8532   }
8533 
8534   // ---------------------------------------------------------------------------
8535   // Build initial VPlan: Scan the body of the loop in a topological order to
8536   // visit each basic block after having visited its predecessor basic blocks.
8537   // ---------------------------------------------------------------------------
8538 
8539   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
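  // For a hypothetical single-block loop body "for.body" containing one
  // predicated store, the skeleton built below (before the pre-entry block is
  // discarded further down) looks like:
  //
  //   Pre-Entry -> for.body -> [pred.store region] -> for.body.0
  //
  // where for.body.0 holds the recipes that follow the replicate region.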
8540   auto Plan = std::make_unique<VPlan>();
8541   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8542   Plan->setEntry(VPBB);
8543 
8544   // Scan the body of the loop in a topological order to visit each basic block
8545   // after having visited its predecessor basic blocks.
8546   LoopBlocksDFS DFS(OrigLoop);
8547   DFS.perform(LI);
8548 
8549   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8550     // Relevant instructions from basic block BB will be grouped into VPRecipe
8551     // ingredients and fill a new VPBasicBlock.
8552     unsigned VPBBsForBB = 0;
8553     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8554     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8555     VPBB = FirstVPBBForBB;
8556     Builder.setInsertPoint(VPBB);
8557 
8558     // Introduce each ingredient into VPlan.
8559     // TODO: Model and preserve debug intrinsics in VPlan.
8560     for (Instruction &I : BB->instructionsWithoutDebug()) {
8561       Instruction *Instr = &I;
8562 
8563       // First filter out irrelevant instructions, to ensure no recipes are
8564       // built for them.
8565       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8566         continue;
8567 
8568       if (auto Recipe =
8569               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8570         for (auto *Def : Recipe->definedValues()) {
8571           auto *UV = Def->getUnderlyingValue();
8572           Plan->addVPValue(UV, Def);
8573         }
8574 
8575         RecipeBuilder.setRecipe(Instr, Recipe);
8576         VPBB->appendRecipe(Recipe);
8577         continue;
8578       }
8579 
8580       // Otherwise, if all widening options failed, the instruction is to be
8581       // replicated. This may create a successor for VPBB.
8582       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
8583           Instr, Range, VPBB, PredInst2Recipe, Plan);
8584       if (NextVPBB != VPBB) {
8585         VPBB = NextVPBB;
8586         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8587                                     : "");
8588       }
8589     }
8590   }
8591 
8592   // Discard the empty dummy pre-entry VPBasicBlock. Note that other
8593   // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
8594   // original basic blocks with no recipes.
8595   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8596   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8597   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8598   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8599   delete PreEntry;
8600 
8601   // ---------------------------------------------------------------------------
8602   // Transform initial VPlan: Apply previously taken decisions, in order, to
8603   // bring the VPlan to its final state.
8604   // ---------------------------------------------------------------------------
8605 
8606   // Apply Sink-After legal constraints.
8607   for (auto &Entry : SinkAfter) {
8608     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8609     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8610     // If the target is in a replication region, make sure to move Sink to the
8611     // block after it, not into the replication region itself.
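    // E.g., if Target sits in the "pred.load.if" block of a replicate region
    // (hypothetical opcode), Sink is moved to the start of the VPBasicBlock
    // that follows the region, so it still executes for every lane rather
    // than only under the region's mask.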
8612 if (auto *Region = 8613 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { 8614 if (Region->isReplicator()) { 8615 assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); 8616 VPBasicBlock *NextBlock = 8617 cast<VPBasicBlock>(Region->getSuccessors().front()); 8618 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8619 continue; 8620 } 8621 } 8622 Sink->moveAfter(Target); 8623 } 8624 8625 // Interleave memory: for each Interleave Group we marked earlier as relevant 8626 // for this VPlan, replace the Recipes widening its memory instructions with a 8627 // single VPInterleaveRecipe at its insertion point. 8628 for (auto IG : InterleaveGroups) { 8629 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8630 RecipeBuilder.getRecipe(IG->getInsertPos())); 8631 SmallVector<VPValue *, 4> StoredValues; 8632 for (unsigned i = 0; i < IG->getFactor(); ++i) 8633 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8634 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8635 8636 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8637 Recipe->getMask()); 8638 VPIG->insertBefore(Recipe); 8639 unsigned J = 0; 8640 for (unsigned i = 0; i < IG->getFactor(); ++i) 8641 if (Instruction *Member = IG->getMember(i)) { 8642 if (!Member->getType()->isVoidTy()) { 8643 VPValue *OriginalV = Plan->getVPValue(Member); 8644 Plan->removeVPValueFor(Member); 8645 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8646 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8647 J++; 8648 } 8649 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8650 } 8651 } 8652 8653 // Adjust the recipes for any inloop reductions. 8654 if (Range.Start.isVector()) 8655 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8656 8657 // Finally, if tail is folded by masking, introduce selects between the phi 8658 // and the live-out instruction of each reduction, at the end of the latch. 8659 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8660 Builder.setInsertPoint(VPBB); 8661 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8662 for (auto &Reduction : Legal->getReductionVars()) { 8663 if (CM.isInLoopReduction(Reduction.first)) 8664 continue; 8665 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8666 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8667 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8668 } 8669 } 8670 8671 std::string PlanName; 8672 raw_string_ostream RSO(PlanName); 8673 ElementCount VF = Range.Start; 8674 Plan->addVF(VF); 8675 RSO << "Initial VPlan for VF={" << VF; 8676 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8677 Plan->addVF(VF); 8678 RSO << "," << VF; 8679 } 8680 RSO << "},UF>=1"; 8681 RSO.flush(); 8682 Plan->setName(PlanName); 8683 8684 return Plan; 8685 } 8686 8687 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8688 // Outer loop handling: They may require CFG and instruction level 8689 // transformations before even evaluating whether vectorization is profitable. 8690 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8691 // the vectorization pipeline. 
8692 assert(!OrigLoop->isInnermost()); 8693 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8694 8695 // Create new empty VPlan 8696 auto Plan = std::make_unique<VPlan>(); 8697 8698 // Build hierarchical CFG 8699 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 8700 HCFGBuilder.buildHierarchicalCFG(); 8701 8702 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 8703 VF *= 2) 8704 Plan->addVF(VF); 8705 8706 if (EnableVPlanPredication) { 8707 VPlanPredicator VPP(*Plan); 8708 VPP.predicate(); 8709 8710 // Avoid running transformation to recipes until masked code generation in 8711 // VPlan-native path is in place. 8712 return Plan; 8713 } 8714 8715 SmallPtrSet<Instruction *, 1> DeadInstructions; 8716 VPlanTransforms::VPInstructionsToVPRecipes( 8717 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 8718 return Plan; 8719 } 8720 8721 // Adjust the recipes for any inloop reductions. The chain of instructions 8722 // leading from the loop exit instr to the phi need to be converted to 8723 // reductions, with one operand being vector and the other being the scalar 8724 // reduction chain. 8725 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 8726 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { 8727 for (auto &Reduction : CM.getInLoopReductionChains()) { 8728 PHINode *Phi = Reduction.first; 8729 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8730 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8731 8732 // ReductionOperations are orders top-down from the phi's use to the 8733 // LoopExitValue. We keep a track of the previous item (the Chain) to tell 8734 // which of the two operands will remain scalar and which will be reduced. 8735 // For minmax the chain will be the select instructions. 8736 Instruction *Chain = Phi; 8737 for (Instruction *R : ReductionOperations) { 8738 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 8739 RecurKind Kind = RdxDesc.getRecurrenceKind(); 8740 8741 VPValue *ChainOp = Plan->getVPValue(Chain); 8742 unsigned FirstOpId; 8743 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8744 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 8745 "Expected to replace a VPWidenSelectSC"); 8746 FirstOpId = 1; 8747 } else { 8748 assert(isa<VPWidenRecipe>(WidenRecipe) && 8749 "Expected to replace a VPWidenSC"); 8750 FirstOpId = 0; 8751 } 8752 unsigned VecOpId = 8753 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 8754 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 8755 8756 auto *CondOp = CM.foldTailByMasking() 8757 ? 
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 8758 : nullptr; 8759 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 8760 &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); 8761 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8762 Plan->removeVPValueFor(R); 8763 Plan->addVPValue(R, RedRecipe); 8764 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 8765 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8766 WidenRecipe->eraseFromParent(); 8767 8768 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8769 VPRecipeBase *CompareRecipe = 8770 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 8771 assert(isa<VPWidenRecipe>(CompareRecipe) && 8772 "Expected to replace a VPWidenSC"); 8773 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 8774 "Expected no remaining users"); 8775 CompareRecipe->eraseFromParent(); 8776 } 8777 Chain = R; 8778 } 8779 } 8780 } 8781 8782 Value* LoopVectorizationPlanner::VPCallbackILV:: 8783 getOrCreateVectorValues(Value *V, unsigned Part) { 8784 return ILV.getOrCreateVectorValue(V, Part); 8785 } 8786 8787 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 8788 Value *V, const VPIteration &Instance) { 8789 return ILV.getOrCreateScalarValue(V, Instance); 8790 } 8791 8792 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 8793 VPSlotTracker &SlotTracker) const { 8794 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 8795 IG->getInsertPos()->printAsOperand(O, false); 8796 O << ", "; 8797 getAddr()->printAsOperand(O, SlotTracker); 8798 VPValue *Mask = getMask(); 8799 if (Mask) { 8800 O << ", "; 8801 Mask->printAsOperand(O, SlotTracker); 8802 } 8803 for (unsigned i = 0; i < IG->getFactor(); ++i) 8804 if (Instruction *I = IG->getMember(i)) 8805 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 8806 } 8807 8808 void VPWidenCallRecipe::execute(VPTransformState &State) { 8809 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 8810 *this, State); 8811 } 8812 8813 void VPWidenSelectRecipe::execute(VPTransformState &State) { 8814 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 8815 this, *this, InvariantCond, State); 8816 } 8817 8818 void VPWidenRecipe::execute(VPTransformState &State) { 8819 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 8820 } 8821 8822 void VPWidenGEPRecipe::execute(VPTransformState &State) { 8823 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 8824 *this, State.UF, State.VF, IsPtrLoopInvariant, 8825 IsIndexLoopInvariant, State); 8826 } 8827 8828 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 8829 assert(!State.Instance && "Int or FP induction being replicated."); 8830 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 8831 Trunc); 8832 } 8833 8834 void VPWidenPHIRecipe::execute(VPTransformState &State) { 8835 Value *StartV = 8836 getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr; 8837 State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF); 8838 } 8839 8840 void VPBlendRecipe::execute(VPTransformState &State) { 8841 State.ILV->setDebugLocFromInst(State.Builder, Phi); 8842 // We know that all PHIs in non-header blocks are converted into 8843 // selects, so we don't have to worry about the insertion order and we 8844 // can just use the builder. 8845 // At this point we generate the predication tree. 
There may be 8846 // duplications since this is a simple recursive scan, but future 8847 // optimizations will clean it up. 8848 8849 unsigned NumIncoming = getNumIncomingValues(); 8850 8851 // Generate a sequence of selects of the form: 8852 // SELECT(Mask3, In3, 8853 // SELECT(Mask2, In2, 8854 // SELECT(Mask1, In1, 8855 // In0))) 8856 // Note that Mask0 is never used: lanes for which no path reaches this phi and 8857 // are essentially undef are taken from In0. 8858 InnerLoopVectorizer::VectorParts Entry(State.UF); 8859 for (unsigned In = 0; In < NumIncoming; ++In) { 8860 for (unsigned Part = 0; Part < State.UF; ++Part) { 8861 // We might have single edge PHIs (blocks) - use an identity 8862 // 'select' for the first PHI operand. 8863 Value *In0 = State.get(getIncomingValue(In), Part); 8864 if (In == 0) 8865 Entry[Part] = In0; // Initialize with the first incoming value. 8866 else { 8867 // Select between the current value and the previous incoming edge 8868 // based on the incoming mask. 8869 Value *Cond = State.get(getMask(In), Part); 8870 Entry[Part] = 8871 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 8872 } 8873 } 8874 } 8875 for (unsigned Part = 0; Part < State.UF; ++Part) 8876 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 8877 } 8878 8879 void VPInterleaveRecipe::execute(VPTransformState &State) { 8880 assert(!State.Instance && "Interleave group being replicated."); 8881 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 8882 getStoredValues(), getMask()); 8883 } 8884 8885 void VPReductionRecipe::execute(VPTransformState &State) { 8886 assert(!State.Instance && "Reduction being replicated."); 8887 for (unsigned Part = 0; Part < State.UF; ++Part) { 8888 RecurKind Kind = RdxDesc->getRecurrenceKind(); 8889 Value *NewVecOp = State.get(getVecOp(), Part); 8890 if (VPValue *Cond = getCondOp()) { 8891 Value *NewCond = State.get(Cond, Part); 8892 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 8893 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 8894 Kind, VecTy->getElementType()); 8895 Constant *IdenVec = 8896 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 8897 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 8898 NewVecOp = Select; 8899 } 8900 Value *NewRed = 8901 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 8902 Value *PrevInChain = State.get(getChainOp(), Part); 8903 Value *NextInChain; 8904 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8905 NextInChain = 8906 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 8907 NewRed, PrevInChain); 8908 } else { 8909 NextInChain = State.Builder.CreateBinOp( 8910 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 8911 PrevInChain); 8912 } 8913 State.set(this, getUnderlyingInstr(), NextInChain, Part); 8914 } 8915 } 8916 8917 void VPReplicateRecipe::execute(VPTransformState &State) { 8918 if (State.Instance) { // Generate a single instance. 8919 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 8920 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, 8921 *State.Instance, IsPredicated, State); 8922 // Insert scalar instance packing it into a vector. 8923 if (AlsoPack && State.VF.isVector()) { 8924 // If we're constructing lane 0, initialize to start from poison. 
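      // For a hypothetical VF of 4, packing lane 0 therefore starts from
      //   %packed.0 = insertelement <4 x i32> poison, i32 %scalar.0, i32 0
      // and each later lane inserts into the vector value produced for the
      // previous lane.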
8925 if (State.Instance->Lane == 0) { 8926 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 8927 Value *Poison = PoisonValue::get( 8928 VectorType::get(getUnderlyingValue()->getType(), State.VF)); 8929 State.ValueMap.setVectorValue(getUnderlyingInstr(), 8930 State.Instance->Part, Poison); 8931 } 8932 State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(), 8933 *State.Instance); 8934 } 8935 return; 8936 } 8937 8938 // Generate scalar instances for all VF lanes of all UF parts, unless the 8939 // instruction is uniform inwhich case generate only the first lane for each 8940 // of the UF parts. 8941 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); 8942 assert((!State.VF.isScalable() || IsUniform) && 8943 "Can't scalarize a scalable vector"); 8944 for (unsigned Part = 0; Part < State.UF; ++Part) 8945 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 8946 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane}, 8947 IsPredicated, State); 8948 } 8949 8950 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 8951 assert(State.Instance && "Branch on Mask works only on single instance."); 8952 8953 unsigned Part = State.Instance->Part; 8954 unsigned Lane = State.Instance->Lane; 8955 8956 Value *ConditionBit = nullptr; 8957 VPValue *BlockInMask = getMask(); 8958 if (BlockInMask) { 8959 ConditionBit = State.get(BlockInMask, Part); 8960 if (ConditionBit->getType()->isVectorTy()) 8961 ConditionBit = State.Builder.CreateExtractElement( 8962 ConditionBit, State.Builder.getInt32(Lane)); 8963 } else // Block in mask is all-one. 8964 ConditionBit = State.Builder.getTrue(); 8965 8966 // Replace the temporary unreachable terminator with a new conditional branch, 8967 // whose two destinations will be set later when they are created. 8968 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 8969 assert(isa<UnreachableInst>(CurrentTerminator) && 8970 "Expected to replace unreachable terminator with conditional branch."); 8971 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 8972 CondBr->setSuccessor(0, nullptr); 8973 ReplaceInstWithInst(CurrentTerminator, CondBr); 8974 } 8975 8976 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 8977 assert(State.Instance && "Predicated instruction PHI works per instance."); 8978 Instruction *ScalarPredInst = 8979 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 8980 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 8981 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 8982 assert(PredicatingBB && "Predicated block has no single predecessor."); 8983 8984 // By current pack/unpack logic we need to generate only a single phi node: if 8985 // a vector value for the predicated instruction exists at this point it means 8986 // the instruction has vector users only, and a phi for the vector value is 8987 // needed. In this case the recipe of the predicated instruction is marked to 8988 // also do that packing, thereby "hoisting" the insert-element sequence. 8989 // Otherwise, a phi node for the scalar value is needed. 
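  // For example (scalar case, hypothetical names), the phi created below for a
  // predicated load looks like:
  //
  //   pred.load.continue:
  //     %phi = phi i32 [ poison, %pred.load.entry ], [ %s, %pred.load.if ]
  //
  // where %s is the scalar load emitted in the predicated block.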
8990 unsigned Part = State.Instance->Part; 8991 Instruction *PredInst = 8992 cast<Instruction>(getOperand(0)->getUnderlyingValue()); 8993 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 8994 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 8995 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 8996 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 8997 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 8998 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 8999 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 9000 } else { 9001 Type *PredInstType = PredInst->getType(); 9002 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9003 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB); 9004 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9005 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 9006 } 9007 } 9008 9009 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9010 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9011 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 9012 StoredValue ? nullptr : getVPValue(), 9013 getAddr(), StoredValue, getMask()); 9014 } 9015 9016 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9017 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9018 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9019 // for predication. 9020 static ScalarEpilogueLowering getScalarEpilogueLowering( 9021 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9022 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9023 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9024 LoopVectorizationLegality &LVL) { 9025 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9026 // don't look at hints or options, and don't request a scalar epilogue. 9027 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9028 // LoopAccessInfo (due to code dependency and not being able to reliably get 9029 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9030 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9031 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9032 // back to the old way and vectorize with versioning when forced. See D81345.) 
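  // For example, "#pragma clang loop vectorize_predicate(enable)" on the
  // source loop surfaces here as Hints.getPredicate() == FK_Enabled (case 3),
  // requesting tail folding by masking instead of a scalar epilogue, unless
  // case 1 or 2 above has already decided otherwise.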
9033 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9034 PGSOQueryType::IRPass) && 9035 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9036 return CM_ScalarEpilogueNotAllowedOptSize; 9037 9038 // 2) If set, obey the directives 9039 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9040 switch (PreferPredicateOverEpilogue) { 9041 case PreferPredicateTy::ScalarEpilogue: 9042 return CM_ScalarEpilogueAllowed; 9043 case PreferPredicateTy::PredicateElseScalarEpilogue: 9044 return CM_ScalarEpilogueNotNeededUsePredicate; 9045 case PreferPredicateTy::PredicateOrDontVectorize: 9046 return CM_ScalarEpilogueNotAllowedUsePredicate; 9047 }; 9048 } 9049 9050 // 3) If set, obey the hints 9051 switch (Hints.getPredicate()) { 9052 case LoopVectorizeHints::FK_Enabled: 9053 return CM_ScalarEpilogueNotNeededUsePredicate; 9054 case LoopVectorizeHints::FK_Disabled: 9055 return CM_ScalarEpilogueAllowed; 9056 }; 9057 9058 // 4) if the TTI hook indicates this is profitable, request predication. 9059 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9060 LVL.getLAI())) 9061 return CM_ScalarEpilogueNotNeededUsePredicate; 9062 9063 return CM_ScalarEpilogueAllowed; 9064 } 9065 9066 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, 9067 unsigned Part) { 9068 set(Def, V, Part); 9069 ILV->setVectorValue(IRDef, Part, V); 9070 } 9071 9072 // Process the loop in the VPlan-native vectorization path. This path builds 9073 // VPlan upfront in the vectorization pipeline, which allows to apply 9074 // VPlan-to-VPlan transformations from the very beginning without modifying the 9075 // input LLVM IR. 9076 static bool processLoopInVPlanNativePath( 9077 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9078 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9079 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9080 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9081 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 9082 9083 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9084 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9085 return false; 9086 } 9087 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9088 Function *F = L->getHeader()->getParent(); 9089 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9090 9091 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9092 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9093 9094 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9095 &Hints, IAI); 9096 // Use the planner for outer loop vectorization. 9097 // TODO: CM is not used at this point inside the planner. Turn CM into an 9098 // optional argument if we don't need it in the future. 9099 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 9100 9101 // Get user vectorization factor. 9102 ElementCount UserVF = Hints.getWidth(); 9103 9104 // Plan how to best vectorize, return the best VF and its cost. 9105 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9106 9107 // If we are stress testing VPlan builds, do not attempt to generate vector 9108 // code. Masked vector code generation support will follow soon. 9109 // Also, do not attempt to vectorize if no vector code will be produced. 
9110 if (VPlanBuildStressTest || EnableVPlanPredication || 9111 VectorizationFactor::Disabled() == VF) 9112 return false; 9113 9114 LVP.setBestPlan(VF.Width, 1); 9115 9116 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9117 &CM, BFI, PSI); 9118 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9119 << L->getHeader()->getParent()->getName() << "\"\n"); 9120 LVP.executePlan(LB, DT); 9121 9122 // Mark the loop as already vectorized to avoid vectorizing again. 9123 Hints.setAlreadyVectorized(); 9124 9125 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9126 return true; 9127 } 9128 9129 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9130 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9131 !EnableLoopInterleaving), 9132 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9133 !EnableLoopVectorization) {} 9134 9135 bool LoopVectorizePass::processLoop(Loop *L) { 9136 assert((EnableVPlanNativePath || L->isInnermost()) && 9137 "VPlan-native path is not enabled. Only process inner loops."); 9138 9139 #ifndef NDEBUG 9140 const std::string DebugLocStr = getDebugLocString(L); 9141 #endif /* NDEBUG */ 9142 9143 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9144 << L->getHeader()->getParent()->getName() << "\" from " 9145 << DebugLocStr << "\n"); 9146 9147 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9148 9149 LLVM_DEBUG( 9150 dbgs() << "LV: Loop hints:" 9151 << " force=" 9152 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9153 ? "disabled" 9154 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9155 ? "enabled" 9156 : "?")) 9157 << " width=" << Hints.getWidth() 9158 << " unroll=" << Hints.getInterleave() << "\n"); 9159 9160 // Function containing loop 9161 Function *F = L->getHeader()->getParent(); 9162 9163 // Looking at the diagnostic output is the only way to determine if a loop 9164 // was vectorized (other than looking at the IR or machine code), so it 9165 // is important to generate an optimization remark for each loop. Most of 9166 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9167 // generated as OptimizationRemark and OptimizationRemarkMissed are 9168 // less verbose reporting vectorized loops and unvectorized loops that may 9169 // benefit from vectorization, respectively. 9170 9171 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9172 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9173 return false; 9174 } 9175 9176 PredicatedScalarEvolution PSE(*SE, *L); 9177 9178 // Check if it is legal to vectorize the loop. 9179 LoopVectorizationRequirements Requirements(*ORE); 9180 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9181 &Requirements, &Hints, DB, AC, BFI, PSI); 9182 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9183 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9184 Hints.emitRemarkWithHints(); 9185 return false; 9186 } 9187 9188 // Check the function attributes and profiles to find out if this function 9189 // should be optimized for size. 9190 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9191 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9192 9193 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9194 // here. They may require CFG and instruction level transformations before 9195 // even evaluating whether vectorization is profitable. 
Since we cannot modify 9196 // the incoming IR, we need to build VPlan upfront in the vectorization 9197 // pipeline. 9198 if (!L->isInnermost()) 9199 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9200 ORE, BFI, PSI, Hints); 9201 9202 assert(L->isInnermost() && "Inner loop expected."); 9203 9204 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9205 // count by optimizing for size, to minimize overheads. 9206 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9207 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9208 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9209 << "This loop is worth vectorizing only if no scalar " 9210 << "iteration overheads are incurred."); 9211 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9212 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9213 else { 9214 LLVM_DEBUG(dbgs() << "\n"); 9215 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9216 } 9217 } 9218 9219 // Check the function attributes to see if implicit floats are allowed. 9220 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9221 // an integer loop and the vector instructions selected are purely integer 9222 // vector instructions? 9223 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9224 reportVectorizationFailure( 9225 "Can't vectorize when the NoImplicitFloat attribute is used", 9226 "loop not vectorized due to NoImplicitFloat attribute", 9227 "NoImplicitFloat", ORE, L); 9228 Hints.emitRemarkWithHints(); 9229 return false; 9230 } 9231 9232 // Check if the target supports potentially unsafe FP vectorization. 9233 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9234 // for the target we're vectorizing for, to make sure none of the 9235 // additional fp-math flags can help. 9236 if (Hints.isPotentiallyUnsafe() && 9237 TTI->isFPVectorizationPotentiallyUnsafe()) { 9238 reportVectorizationFailure( 9239 "Potentially unsafe FP op prevents vectorization", 9240 "loop not vectorized due to unsafe FP support.", 9241 "UnsafeFP", ORE, L); 9242 Hints.emitRemarkWithHints(); 9243 return false; 9244 } 9245 9246 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9247 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9248 9249 // If an override option has been passed in for interleaved accesses, use it. 9250 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9251 UseInterleaved = EnableInterleavedMemAccesses; 9252 9253 // Analyze interleaved memory accesses. 9254 if (UseInterleaved) { 9255 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 9256 } 9257 9258 // Use the cost model. 9259 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 9260 F, &Hints, IAI); 9261 CM.collectValuesToIgnore(); 9262 9263 // Use the planner for vectorization. 9264 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 9265 9266 // Get user vectorization factor and interleave count. 9267 ElementCount UserVF = Hints.getWidth(); 9268 unsigned UserIC = Hints.getInterleave(); 9269 9270 // Plan how to best vectorize, return the best VF and its cost. 9271 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 9272 9273 VectorizationFactor VF = VectorizationFactor::Disabled(); 9274 unsigned IC = 1; 9275 9276 if (MaybeVF) { 9277 VF = *MaybeVF; 9278 // Select the interleave count. 
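    // E.g., the cost model may return IC = 2 for a small loop body so that two
    // VF-wide iterations are issued per vector-loop iteration; a user-provided
    // interleave count (e.g. from "#pragma clang loop interleave_count(2)")
    // overrides this choice further below.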
9279     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9280   }
9281 
9282   // Identify the diagnostic messages that should be produced.
9283   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9284   bool VectorizeLoop = true, InterleaveLoop = true;
9285   if (Requirements.doesNotMeet(F, L, Hints)) {
9286     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9287                          "requirements.\n");
9288     Hints.emitRemarkWithHints();
9289     return false;
9290   }
9291 
9292   if (VF.Width.isScalar()) {
9293     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9294     VecDiagMsg = std::make_pair(
9295         "VectorizationNotBeneficial",
9296         "the cost-model indicates that vectorization is not beneficial");
9297     VectorizeLoop = false;
9298   }
9299 
9300   if (!MaybeVF && UserIC > 1) {
9301     // Tell the user interleaving was avoided up-front, despite being explicitly
9302     // requested.
9303     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9304                          "interleaving should be avoided up front\n");
9305     IntDiagMsg = std::make_pair(
9306         "InterleavingAvoided",
9307         "Ignoring UserIC, because interleaving was avoided up front");
9308     InterleaveLoop = false;
9309   } else if (IC == 1 && UserIC <= 1) {
9310     // Tell the user interleaving is not beneficial.
9311     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9312     IntDiagMsg = std::make_pair(
9313         "InterleavingNotBeneficial",
9314         "the cost-model indicates that interleaving is not beneficial");
9315     InterleaveLoop = false;
9316     if (UserIC == 1) {
9317       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9318       IntDiagMsg.second +=
9319           " and is explicitly disabled or interleave count is set to 1";
9320     }
9321   } else if (IC > 1 && UserIC == 1) {
9322     // Tell the user interleaving is beneficial, but it is explicitly disabled.
9323     LLVM_DEBUG(
9324         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9325     IntDiagMsg = std::make_pair(
9326         "InterleavingBeneficialButDisabled",
9327         "the cost-model indicates that interleaving is beneficial "
9328         "but is explicitly disabled or interleave count is set to 1");
9329     InterleaveLoop = false;
9330   }
9331 
9332   // Override IC if user provided an interleave count.
9333   IC = UserIC > 0 ? UserIC : IC;
9334 
9335   // Emit diagnostic messages, if any.
9336   const char *VAPassName = Hints.vectorizeAnalysisPassName();
9337   if (!VectorizeLoop && !InterleaveLoop) {
9338     // Do not vectorize or interleave the loop.
9339 ORE->emit([&]() { 9340 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 9341 L->getStartLoc(), L->getHeader()) 9342 << VecDiagMsg.second; 9343 }); 9344 ORE->emit([&]() { 9345 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 9346 L->getStartLoc(), L->getHeader()) 9347 << IntDiagMsg.second; 9348 }); 9349 return false; 9350 } else if (!VectorizeLoop && InterleaveLoop) { 9351 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9352 ORE->emit([&]() { 9353 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 9354 L->getStartLoc(), L->getHeader()) 9355 << VecDiagMsg.second; 9356 }); 9357 } else if (VectorizeLoop && !InterleaveLoop) { 9358 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9359 << ") in " << DebugLocStr << '\n'); 9360 ORE->emit([&]() { 9361 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 9362 L->getStartLoc(), L->getHeader()) 9363 << IntDiagMsg.second; 9364 }); 9365 } else if (VectorizeLoop && InterleaveLoop) { 9366 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9367 << ") in " << DebugLocStr << '\n'); 9368 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9369 } 9370 9371 LVP.setBestPlan(VF.Width, IC); 9372 9373 using namespace ore; 9374 bool DisableRuntimeUnroll = false; 9375 MDNode *OrigLoopID = L->getLoopID(); 9376 9377 if (!VectorizeLoop) { 9378 assert(IC > 1 && "interleave count should not be 1 or 0"); 9379 // If we decided that it is not legal to vectorize the loop, then 9380 // interleave it. 9381 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, 9382 BFI, PSI); 9383 LVP.executePlan(Unroller, DT); 9384 9385 ORE->emit([&]() { 9386 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 9387 L->getHeader()) 9388 << "interleaved loop (interleaved count: " 9389 << NV("InterleaveCount", IC) << ")"; 9390 }); 9391 } else { 9392 // If we decided that it is *legal* to vectorize the loop, then do it. 9393 9394 // Consider vectorizing the epilogue too if it's profitable. 9395 VectorizationFactor EpilogueVF = 9396 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 9397 if (EpilogueVF.Width.isVector()) { 9398 9399 // The first pass vectorizes the main loop and creates a scalar epilogue 9400 // to be vectorized by executing the plan (potentially with a different 9401 // factor) again shortly afterwards. 9402 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 9403 EpilogueVF.Width.getKnownMinValue(), 1); 9404 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, 9405 &LVL, &CM, BFI, PSI); 9406 9407 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 9408 LVP.executePlan(MainILV, DT); 9409 ++LoopsVectorized; 9410 9411 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9412 formLCSSARecursively(*L, *DT, LI, SE); 9413 9414 // Second pass vectorizes the epilogue and adjusts the control flow 9415 // edges from the first pass. 
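      // For instance, with a hypothetical main VF of 8 and epilogue VF of 4,
      // iterations are first consumed 8 at a time, then the remaining ones 4
      // at a time, and any final leftover iterations run in the scalar
      // remainder loop.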
9416 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); 9417 EPI.MainLoopVF = EPI.EpilogueVF; 9418 EPI.MainLoopUF = EPI.EpilogueUF; 9419 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 9420 ORE, EPI, &LVL, &CM, BFI, PSI); 9421 LVP.executePlan(EpilogILV, DT); 9422 ++LoopsEpilogueVectorized; 9423 9424 if (!MainILV.areSafetyChecksAdded()) 9425 DisableRuntimeUnroll = true; 9426 } else { 9427 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 9428 &LVL, &CM, BFI, PSI); 9429 LVP.executePlan(LB, DT); 9430 ++LoopsVectorized; 9431 9432 // Add metadata to disable runtime unrolling a scalar loop when there are 9433 // no runtime checks about strides and memory. A scalar loop that is 9434 // rarely used is not worth unrolling. 9435 if (!LB.areSafetyChecksAdded()) 9436 DisableRuntimeUnroll = true; 9437 } 9438 9439 // Report the vectorization decision. 9440 ORE->emit([&]() { 9441 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 9442 L->getHeader()) 9443 << "vectorized loop (vectorization width: " 9444 << NV("VectorizationFactor", VF.Width) 9445 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 9446 }); 9447 } 9448 9449 Optional<MDNode *> RemainderLoopID = 9450 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 9451 LLVMLoopVectorizeFollowupEpilogue}); 9452 if (RemainderLoopID.hasValue()) { 9453 L->setLoopID(RemainderLoopID.getValue()); 9454 } else { 9455 if (DisableRuntimeUnroll) 9456 AddRuntimeUnrollDisableMetaData(L); 9457 9458 // Mark the loop as already vectorized to avoid vectorizing again. 9459 Hints.setAlreadyVectorized(); 9460 } 9461 9462 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9463 return true; 9464 } 9465 9466 LoopVectorizeResult LoopVectorizePass::runImpl( 9467 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 9468 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 9469 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 9470 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 9471 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 9472 SE = &SE_; 9473 LI = &LI_; 9474 TTI = &TTI_; 9475 DT = &DT_; 9476 BFI = &BFI_; 9477 TLI = TLI_; 9478 AA = &AA_; 9479 AC = &AC_; 9480 GetLAA = &GetLAA_; 9481 DB = &DB_; 9482 ORE = &ORE_; 9483 PSI = PSI_; 9484 9485 // Don't attempt if 9486 // 1. the target claims to have no vector registers, and 9487 // 2. interleaving won't help ILP. 9488 // 9489 // The second condition is necessary because, even if the target has no 9490 // vector registers, loop vectorization may still enable scalar 9491 // interleaving. 9492 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 9493 TTI->getMaxInterleaveFactor(1) < 2) 9494 return LoopVectorizeResult(false, false); 9495 9496 bool Changed = false, CFGChanged = false; 9497 9498 // The vectorizer requires loops to be in simplified form. 9499 // Since simplification may add new inner loops, it has to run before the 9500 // legality and profitability checks. This means running the loop vectorizer 9501 // will simplify all loops, regardless of whether anything end up being 9502 // vectorized. 9503 for (auto &L : *LI) 9504 Changed |= CFGChanged |= 9505 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9506 9507 // Build up a worklist of inner-loops to vectorize. This is necessary as 9508 // the act of vectorizing or partially unrolling a loop creates new loops 9509 // and can invalidate iterators across the loops. 
9510 SmallVector<Loop *, 8> Worklist; 9511 9512 for (Loop *L : *LI) 9513 collectSupportedLoops(*L, LI, ORE, Worklist); 9514 9515 LoopsAnalyzed += Worklist.size(); 9516 9517 // Now walk the identified inner loops. 9518 while (!Worklist.empty()) { 9519 Loop *L = Worklist.pop_back_val(); 9520 9521 // For the inner loops we actually process, form LCSSA to simplify the 9522 // transform. 9523 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 9524 9525 Changed |= CFGChanged |= processLoop(L); 9526 } 9527 9528 // Process each loop nest in the function. 9529 return LoopVectorizeResult(Changed, CFGChanged); 9530 } 9531 9532 PreservedAnalyses LoopVectorizePass::run(Function &F, 9533 FunctionAnalysisManager &AM) { 9534 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 9535 auto &LI = AM.getResult<LoopAnalysis>(F); 9536 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 9537 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 9538 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 9539 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 9540 auto &AA = AM.getResult<AAManager>(F); 9541 auto &AC = AM.getResult<AssumptionAnalysis>(F); 9542 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 9543 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 9544 MemorySSA *MSSA = EnableMSSALoopDependency 9545 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 9546 : nullptr; 9547 9548 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 9549 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 9550 [&](Loop &L) -> const LoopAccessInfo & { 9551 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 9552 TLI, TTI, nullptr, MSSA}; 9553 return LAM.getResult<LoopAccessAnalysis>(L, AR); 9554 }; 9555 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 9556 ProfileSummaryInfo *PSI = 9557 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 9558 LoopVectorizeResult Result = 9559 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 9560 if (!Result.MadeAnyChange) 9561 return PreservedAnalyses::all(); 9562 PreservedAnalyses PA; 9563 9564 // We currently do not preserve loopinfo/dominator analyses with outer loop 9565 // vectorization. Until this is addressed, mark these analyses as preserved 9566 // only for non-VPlan-native path. 9567 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 9568 if (!EnableVPlanNativePath) { 9569 PA.preserve<LoopAnalysis>(); 9570 PA.preserve<DominatorTreeAnalysis>(); 9571 } 9572 PA.preserve<BasicAA>(); 9573 PA.preserve<GlobalsAA>(); 9574 if (!Result.MadeCFGChange) 9575 PA.preserveSet<CFGAnalyses>(); 9576 return PA; 9577 } 9578