//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
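//
// As a rough illustration of the widening described above (independent of any
// particular target, and not the exact IR this pass emits), a scalar loop such
// as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each iteration of the vector loop handles
// VF consecutive elements and the induction variable advances by VF, with a
// scalar remainder (epilogue) loop handling the leftover iterations:
//
//   for (i = 0; i + 4 <= n; i += 4)      // vector body; VF = 4 in this sketch
//     a[i:i+4] = b[i:i+4] + c[i:i+4];    // one wide add per 4 scalar adds
//   for (; i < n; ++i)                   // scalar epilogue (remainder)
//     a[i] = b[i] + c[i];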
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 202 // that predication is preferred, and this lists all options. I.e., the 203 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 204 // and predicate the instructions accordingly. 
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));
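
// As a usage sketch (the exact command line is illustrative and not taken from
// a specific test): the force-target-* flags above exist so the pass can be
// exercised from 'opt' in a host-independent way, e.g. along the lines of
//   opt -loop-vectorize -force-target-instruction-cost=1 -S input.ll
// so that cost-model-driven decisions do not vary with the host target.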
Mostly " 274 "useful for getting consistent testing.")); 275 276 static cl::opt<bool> ForceTargetSupportsScalableVectors( 277 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 278 cl::desc( 279 "Pretend that scalable vectors are supported, even if the target does " 280 "not support them. This flag should only be used for testing.")); 281 282 static cl::opt<unsigned> SmallLoopCost( 283 "small-loop-cost", cl::init(20), cl::Hidden, 284 cl::desc( 285 "The cost of a loop that is considered 'small' by the interleaver.")); 286 287 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 288 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 289 cl::desc("Enable the use of the block frequency analysis to access PGO " 290 "heuristics minimizing code growth in cold regions and being more " 291 "aggressive in hot regions.")); 292 293 // Runtime interleave loops for load/store throughput. 294 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 295 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 296 cl::desc( 297 "Enable runtime interleaving until load/store ports are saturated")); 298 299 /// Interleave small loops with scalar reductions. 300 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 301 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 302 cl::desc("Enable interleaving for loops with small iteration counts that " 303 "contain scalar reductions to expose ILP.")); 304 305 /// The number of stores in a loop that are allowed to need predication. 306 static cl::opt<unsigned> NumberOfStoresToPredicate( 307 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 308 cl::desc("Max number of stores to be predicated behind an if.")); 309 310 static cl::opt<bool> EnableIndVarRegisterHeur( 311 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 312 cl::desc("Count the induction variable only once when interleaving")); 313 314 static cl::opt<bool> EnableCondStoresVectorization( 315 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 316 cl::desc("Enable if predication of stores during vectorization.")); 317 318 static cl::opt<unsigned> MaxNestedScalarReductionIC( 319 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 320 cl::desc("The maximum interleave count to use when interleaving a scalar " 321 "reduction in a nested loop.")); 322 323 static cl::opt<bool> 324 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 325 cl::Hidden, 326 cl::desc("Prefer in-loop vector reductions, " 327 "overriding the targets preference.")); 328 329 static cl::opt<bool> PreferPredicatedReductionSelect( 330 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 331 cl::desc( 332 "Prefer predicating a reduction operation over an after loop select.")); 333 334 cl::opt<bool> EnableVPlanNativePath( 335 "enable-vplan-native-path", cl::init(false), cl::Hidden, 336 cl::desc("Enable VPlan-native vectorization path with " 337 "support for outer loop vectorization.")); 338 339 // FIXME: Remove this switch once we have divergence analysis. Currently we 340 // assume divergent non-backedge branches when this switch is true. 341 cl::opt<bool> EnableVPlanPredication( 342 "enable-vplan-predication", cl::init(false), cl::Hidden, 343 cl::desc("Enable VPlan-native vectorization path predicator with " 344 "support for outer loop vectorization.")); 345 346 // This flag enables the stress testing of the VPlan H-CFG construction in the 347 // VPlan-native vectorization path. 
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return TypeSize::get(VF.getKnownMinValue() *
                             DL.getTypeAllocSize(Ty).getFixedValue(),
                         VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
                           Value *StartV, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start,
                             TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
    VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
  }

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// Fix up the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                                 LoopInfo *LI, DominatorTree *DT,
                                 const TargetLibraryInfo *TLI,
                                 const TargetTransformInfo *TTI,
                                 AssumptionCache *AC,
                                 OptimizationRemarkEmitter *ORE,
                                 EpilogueLoopVectorizationInfo &EPI,
                                 LoopVectorizationLegality *LVL,
                                 llvm::LoopVectorizationCostModel *CM,
                                 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    }
    else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
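
// As a concrete reading of the helper above (illustrative numbers only): with
// Step == i64 1 and a fixed VF of 4, createStepForVF simply returns the
// constant i64 4; with a scalable VF whose known minimum is 4, it instead goes
// through B.CreateVScale and returns vscale * 4, so the step tracks the
// runtime vector length.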

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints for how the scalar epilogue loop should
// be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Set up cost-based decisions for the user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
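  /// For example (illustrative), a unit-stride load of A[i] can be widened
  /// into one wide load of A[i..i+VF-1], whereas an access like A[2*i] cannot
  /// and is instead gathered, scattered or scalarized.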
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue() const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail of the
  /// loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize);

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, a power-of-2 larger
  /// than zero. One is returned if vectorization should best be avoided due
  /// to cost.
  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
                                    ElementCount UserVF);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(ElementCount VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be
  /// truncated to this type.
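  /// For example (illustrative), an i32 computation whose result is only ever
  /// used as an i8 may be recorded here with a width of 8, so the vectorized
  /// code can operate on <VF x i8> instead of <VF x i32>.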
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. Non-uniform scalarized instructions will be
  /// represented by VF values in the vectorized loop, each corresponding to an
  /// iteration of the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  DecisionList WideningDecisions;

  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, ElementCount VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    if (VF.isScalar() || !I || !TheLoop->contains(I) ||
        TheLoop->isLoopInvariant(I))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);
  };

  /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   ElementCount VF) {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }

  /// Determines if we have the infrastructure to vectorize loop \p L and its
  /// epilogue, assuming the main loop is vectorized by \p VF.
  bool isCandidateForEpilogueVectorization(const Loop &L,
                                           const ElementCount VF) const;

  /// Returns true if epilogue vectorization is considered profitable, and
  /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;

public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The function containing the loop.
  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// that have the same stride and are close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// Profitable vector factors.
  SmallVector<VectorizationFactor, 8> ProfitableVFs;
};

} // end namespace llvm

// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legality checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->isInnermost() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}

static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If
  // we are stress testing the VPlan H-CFG construction, we collect the
  // outermost loop of every loop nest.
  if (L.isInnermost() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}

namespace {

/// The LoopVectorize Pass.
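/// (This is the legacy pass-manager wrapper: it only gathers the required
/// analyses and forwards to LoopVectorizePass::runImpl; the new pass manager
/// invokes LoopVectorizePass::run directly.)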
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid.
  static char ID;

  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID),
        Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI).MadeAnyChange;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addRequired<InjectTLIMappingsLegacy>();

    // We currently do not preserve the LoopInfo/DominatorTree analyses with
    // outer loop vectorization. Until this is addressed, mark these analyses
    // as preserved only for the non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//

Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Else, broadcast will be inside
  // vector loop body.
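  // For example (illustrative IR only; IRBuilder::CreateVectorSplat emits the
  // actual sequence), broadcasting a scalar %x at VF=4 produces roughly:
  //   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                          <4 x i32> poison, <4 x i32> zeroinitializer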
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
                     (!Instr ||
                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
  // Place the code for broadcasting invariant variables in the new preheader.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (SafeToHoist)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}

void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Value *Start,
    Instruction *EntryVal) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader.
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart =
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Value *ConstVF =
      getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
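  //
  // For example (illustrative), with VF = 4, UF = 2 and an integer step of 1,
  // the loop below produces:
  //   part 0: <i, i+1, i+2, i+3>
  //   part 1: part 0 + <4, 4, 4, 4>
  // and the value fed back into the phi ("vec.ind.next") advances by
  // UF * VF = 8 elements per vector iteration.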
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);

    LastInduction = cast<Instruction>(addFastMathFlag(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}

bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}

bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}

void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
    const InductionDescriptor &ID, const Instruction *EntryVal,
    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // This induction variable is not the phi from the original loop but a
  // newly-created IV, based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor as the original IV, but we don't
  // have to do any recording in this case - that is done when the original IV
  // is processed.
  if (isa<TruncInst>(EntryVal))
    return;

  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (Casts.empty())
    return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
  // induction update chain itself.
  Instruction *CastInst = *Casts.begin();
  if (Lane < UINT_MAX)
    VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
  else
    VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
}

void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
                                                TruncInst *Trunc) {
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars().find(IV);
  assert(II != Legal->getInductionVars().end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
  auto CreateStepValue = [&](const SCEV *Step) -> Value * {
    assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
           "Induction step should be loop invariant");
    if (PSE.getSE()->isSCEVable(IV->getType())) {
      SCEVExpander Exp(*PSE.getSE(), DL, "induction");
      return Exp.expandCodeFor(Step, Step->getType(),
                               LoopVectorPreHeader->getTerminator());
    }
    return cast<SCEVUnknown>(Step)->getValue();
  };

  // The scalar value to broadcast. This is derived from the canonical
  // induction variable. If a truncation type is given, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  auto CreateScalarIV = [&](Value *&Step) -> Value * {
    Value *ScalarIV = Induction;
    if (IV != OldInduction) {
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
    return ScalarIV;
  };

  // Create the vector values from the scalar IV, in the absence of creating a
  // vector IV.
  auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      Value *EntryPart =
          getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
                        ID.getInductionOpcode());
      VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
    }
  };

  // Now do the actual transformations, and start with creating the step value.
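  // Depending on VF and on which users of the IV are widened, the code below
  // emits a vector induction phi, per-lane scalar steps, a broadcast of the
  // scalar IV, or a combination of these.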
  Value *Step = CreateStepValue(ID.getStep());
  if (VF.isZero() || VF.isScalar()) {
    Value *ScalarIV = CreateScalarIV(Step);
    CreateSplatIV(ScalarIV, Step);
    return;
  }

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = needsScalarInduction(EntryVal);
  if (!NeedsScalarIV) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal);
    return;
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (!shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal);
    Value *ScalarIV = CreateScalarIV(Step);
    // Create scalar steps that can be used by instructions we will later
    // scalarize. Note that the addition of the scalar steps will not increase
    // the number of instructions in the loop in the common case prior to
    // InstCombine. We will be trading one vector extract for each scalar step.
    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
    return;
  }

  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV. Except when we tail-fold, then the splat IV feeds the
  // predicate used by the masked loads/stores.
  Value *ScalarIV = CreateScalarIV(Step);
  if (!Cost->isScalarEpilogueAllowed())
    CreateSplatIV(ScalarIV, Step);
  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}

Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                          Instruction::BinaryOps BinOp) {
  // Create and check the types.
  auto *ValVTy = cast<FixedVectorType>(Val->getType());
  int VLen = ValVTy->getNumElements();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers from zero to VF.
    for (int i = 0; i < VLen; ++i)
      Indices.push_back(ConstantInt::get(STy, StartIdx + i));

    // Add the consecutive indices to the vector value.
    Constant *Cv = ConstantVector::get(Indices);
    assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(Cv, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers from zero to VF.
  for (int i = 0; i < VLen; ++i)
    Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));

  // Add the consecutive indices to the vector value.
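  // For example (illustrative), with VLen = 4 and StartIdx = 0 the indices are
  // <0.0, 1.0, 2.0, 3.0>, and the result below is
  // Val BinOp (<0.0, 1.0, 2.0, 3.0> * Step), with fast-math flags applied.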
  Constant *Cv = ConstantVector::get(Indices);

  Step = Builder.CreateVectorSplat(VLen, Step);

  // Floating point operations had to be 'fast' to enable the induction.
  FastMathFlags Flags;
  Flags.setFast();

  Value *MulOp = Builder.CreateFMul(Cv, Step);
  if (isa<Instruction>(MulOp))
    // Have to check, MulOp may be a constant.
    cast<Instruction>(MulOp)->setFastMathFlags(Flags);

  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
  if (isa<Instruction>(BOp))
    cast<Instruction>(BOp)->setFastMathFlags(Flags);
  return BOp;
}

void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Instruction *EntryVal,
                                           const InductionDescriptor &ID) {
  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy == Step->getType() &&
         "Val and Step should have the same type");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration. If EntryVal is uniform, we only need to generate the first
  // lane. Otherwise, we generate all VF values.
  unsigned Lanes =
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
          ? 1
          : VF.getKnownMinValue();
  assert((!VF.isScalable() || Lanes == 1) &&
         "Should never scalarize a scalable vector");
  // Compute the scalar steps and save the results in VectorLoopValueMap.
  for (unsigned Part = 0; Part < UF; ++Part) {
    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
                                         ScalarIVTy->getScalarSizeInBits());
      Value *StartIdx =
          createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
      if (ScalarIVTy->isFloatingPointTy())
        StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
      StartIdx = addFastMathFlag(Builder.CreateBinOp(
          AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
      auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
      VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
    }
  }
}

Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  // If we have a stride that is replaced by one, do it here. Defer this for
  // the VPlan-native path until we start running Legal checks in that path.
  if (!EnableVPlanNativePath && Legal->hasStride(V))
    V = ConstantInt::get(V->getType(), 1);

  // If we have a vector mapped to this value, return it.
  if (VectorLoopValueMap.hasVectorValue(V, Part))
    return VectorLoopValueMap.getVectorValue(V, Part);

  // If the value has not been vectorized, check if it has been scalarized
  // instead. If it has been scalarized, and we actually need the value in
  // vector form, we will construct the vector values on demand.
  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});

    // If we've scalarized a value, that value should be an instruction.
    auto *I = cast<Instruction>(V);

    // If we aren't vectorizing, we can just copy the scalar map values over to
    // the vector map.
    if (VF.isScalar()) {
      VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
      return ScalarValue;
    }

    // Get the last scalar instruction we generated for V and Part. If the
    // value is known to be uniform after vectorization, this corresponds to
    // lane zero of the Part unroll iteration. Otherwise, the last instruction
    // is the one we created for the last vector lane of the Part unroll
    // iteration.
    unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
                            ? 0
                            : VF.getKnownMinValue() - 1;
    assert((!VF.isScalable() || LastLane == 0) &&
           "Scalable vectorization can't lead to any scalarized values.");
    auto *LastInst = cast<Instruction>(
        VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));

    // Set the insert point after the last scalarized instruction. This ensures
    // the insertelement sequence will directly follow the scalar definitions.
    auto OldIP = Builder.saveIP();
    auto NewIP = std::next(BasicBlock::iterator(LastInst));
    Builder.SetInsertPoint(&*NewIP);

    // However, if we are vectorizing, we need to construct the vector values.
    // If the value is known to be uniform after vectorization, we can just
    // broadcast the scalar value corresponding to lane zero for each unroll
    // iteration. Otherwise, we construct the vector values using insertelement
    // instructions. Since the resulting vectors are stored in
    // VectorLoopValueMap, we will only generate the insertelements once.
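    //
    // For example (illustrative IR), packing four scalarized values %s0..%s3
    // emits a chain of insertelements starting from poison:
    //   %p0 = insertelement <4 x i32> poison, i32 %s0, i32 0
    //   %p1 = insertelement <4 x i32> %p0,    i32 %s1, i32 1
    //   ... continuing up to lane VF-1 (see packScalarIntoVectorValue).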
    Value *VectorValue = nullptr;
    if (Cost->isUniformAfterVectorization(I, VF)) {
      VectorValue = getBroadcastInstrs(ScalarValue);
      VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
    } else {
      // Initialize packing with insertelements to start from poison.
      assert(!VF.isScalable() && "VF is assumed to be non scalable.");
      Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF));
      VectorLoopValueMap.setVectorValue(V, Part, Poison);
      for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
        packScalarIntoVectorValue(V, {Part, Lane});
      VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
    }
    Builder.restoreIP(OldIP);
    return VectorValue;
  }

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  VectorLoopValueMap.setVectorValue(V, Part, B);
  return B;
}

Value *
InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
                                            const VPIteration &Instance) {
  // If the value is not an instruction contained in the loop, it should
  // already be scalar.
  if (OrigLoop->isLoopInvariant(V))
    return V;

  assert(Instance.Lane > 0
             ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
             : true && "Uniform values only have lane zero");

  // If the value from the original loop has not been vectorized, it is
  // represented by UF x VF scalar values in the new loop. Return the requested
  // scalar value.
  if (VectorLoopValueMap.hasScalarValue(V, Instance))
    return VectorLoopValueMap.getScalarValue(V, Instance);

  // If the value has not been scalarized, get its entry in VectorLoopValueMap
  // for the given unroll part. If this entry is not a vector type (i.e., the
  // vectorization factor is one), there is no need to generate an
  // extractelement instruction.
  auto *U = getOrCreateVectorValue(V, Instance.Part);
  if (!U->getType()->isVectorTy()) {
    assert(VF.isScalar() && "Value not scalarized has non-vector type");
    return U;
  }

  // Otherwise, the value from the original loop has been vectorized and is
  // represented by UF vector values. Extract and return the requested scalar
  // value from the appropriate vector lane.
  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
}

void InnerLoopVectorizer::packScalarIntoVectorValue(
    Value *V, const VPIteration &Instance) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
                                            Builder.getInt32(Instance.Lane));
  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
}

Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");
  assert(!VF.isScalable() && "Cannot reverse scalable vectors");
  SmallVector<int, 8> ShuffleMask;
  for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
    ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);

  return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
}

// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
    return EnableMaskedInterleavedMemAccesses;

  return TTI.enableMaskedInterleavedAccessVectorization();
}

// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ...                     // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>  ; R elements
//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(
    const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
    VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
    VPValue *BlockInMask) {
  Instruction *Instr = Group->getInsertPos();
  const DataLayout &DL = Instr->getModule()->getDataLayout();

  // Prepare for the vector type of the interleaved load/store.
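  // As in the example in the function comment, VF = 4 with an interleave
  // factor of 3 yields a single <12 x i32> wide access per unroll part.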
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  assert(!VF.isScalable() &&
         "scalable vector reverse operation is not implemented");
  if (Group->isReverse())
    Index += (VF.getKnownMinValue() - 1) * Group->getFactor();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, {Part, 0});
    setDebugLocFromInst(Builder, AddrPart);

    // Note that the current instruction could be at any index in the group;
    // we need to adjust the address to the member of index 0.
    //
    // E.g. a = A[i+1];   // Member of index 1 (Current instruction)
    //      b = A[i];     // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g. A[i+1] = a;   // Member of index 1
    //      A[i]   = b;   // Member of index 0
    //      A[i+2] = c;   // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *PoisonVec = PoisonValue::get(VecTy);

  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    assert(!VF.isScalable() && "scalable vectors not yet supported.");
    MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
2610 SmallVector<Value *, 2> NewLoads; 2611 for (unsigned Part = 0; Part < UF; Part++) { 2612 Instruction *NewLoad; 2613 if (BlockInMask || MaskForGaps) { 2614 assert(useMaskedInterleavedAccesses(*TTI) && 2615 "masked interleaved groups are not allowed."); 2616 Value *GroupMask = MaskForGaps; 2617 if (BlockInMask) { 2618 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2619 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2620 Value *ShuffledMask = Builder.CreateShuffleVector( 2621 BlockInMaskPart, 2622 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2623 "interleaved.mask"); 2624 GroupMask = MaskForGaps 2625 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2626 MaskForGaps) 2627 : ShuffledMask; 2628 } 2629 NewLoad = 2630 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2631 GroupMask, PoisonVec, "wide.masked.vec"); 2632 } 2633 else 2634 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2635 Group->getAlign(), "wide.vec"); 2636 Group->addMetadata(NewLoad); 2637 NewLoads.push_back(NewLoad); 2638 } 2639 2640 // For each member in the group, shuffle out the appropriate data from the 2641 // wide loads. 2642 unsigned J = 0; 2643 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2644 Instruction *Member = Group->getMember(I); 2645 2646 // Skip the gaps in the group. 2647 if (!Member) 2648 continue; 2649 2650 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2651 auto StrideMask = 2652 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2653 for (unsigned Part = 0; Part < UF; Part++) { 2654 Value *StridedVec = Builder.CreateShuffleVector( 2655 NewLoads[Part], StrideMask, "strided.vec"); 2656 2657 // If this member has different type, cast the result type. 2658 if (Member->getType() != ScalarTy) { 2659 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2660 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2661 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2662 } 2663 2664 if (Group->isReverse()) 2665 StridedVec = reverseVector(StridedVec); 2666 2667 State.set(VPDefs[J], Member, StridedVec, Part); 2668 } 2669 ++J; 2670 } 2671 return; 2672 } 2673 2674 // The sub vector type for current instruction. 2675 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2676 auto *SubVT = VectorType::get(ScalarTy, VF); 2677 2678 // Vectorize the interleaved store group. 2679 for (unsigned Part = 0; Part < UF; Part++) { 2680 // Collect the stored vector from each member. 2681 SmallVector<Value *, 4> StoredVecs; 2682 for (unsigned i = 0; i < InterleaveFactor; i++) { 2683 // Interleaved store group doesn't allow a gap, so each index has a member 2684 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2685 2686 Value *StoredVec = State.get(StoredValues[i], Part); 2687 2688 if (Group->isReverse()) 2689 StoredVec = reverseVector(StoredVec); 2690 2691 // If this member has different type, cast it to a unified type. 2692 2693 if (StoredVec->getType() != SubVT) 2694 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2695 2696 StoredVecs.push_back(StoredVec); 2697 } 2698 2699 // Concatenate all vectors into a wide vector. 2700 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2701 2702 // Interleave the elements in the wide vector. 
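// For example, assuming a fixed VF of 4 and an interleave factor of 3, the
// interleave mask used below is <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>, which
// turns the concatenation R0..R3,G0..G3,B0..B3 into R0,G0,B0,R1,G1,B1,...,
// matching the store example above.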
2703 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2704 Value *IVec = Builder.CreateShuffleVector( 2705 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2706 "interleaved.vec"); 2707 2708 Instruction *NewStoreInstr; 2709 if (BlockInMask) { 2710 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2711 Value *ShuffledMask = Builder.CreateShuffleVector( 2712 BlockInMaskPart, 2713 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2714 "interleaved.mask"); 2715 NewStoreInstr = Builder.CreateMaskedStore( 2716 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2717 } 2718 else 2719 NewStoreInstr = 2720 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2721 2722 Group->addMetadata(NewStoreInstr); 2723 } 2724 } 2725 2726 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2727 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2728 VPValue *StoredValue, VPValue *BlockInMask) { 2729 // Attempt to issue a wide load. 2730 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2731 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2732 2733 assert((LI || SI) && "Invalid Load/Store instruction"); 2734 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2735 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2736 2737 LoopVectorizationCostModel::InstWidening Decision = 2738 Cost->getWideningDecision(Instr, VF); 2739 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2740 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2741 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2742 "CM decision is not to widen the memory instruction"); 2743 2744 Type *ScalarDataTy = getMemInstValueType(Instr); 2745 2746 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2747 const Align Alignment = getLoadStoreAlignment(Instr); 2748 2749 // Determine if the pointer operand of the access is either consecutive or 2750 // reverse consecutive. 2751 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2752 bool ConsecutiveStride = 2753 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2754 bool CreateGatherScatter = 2755 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2756 2757 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2758 // gather/scatter. Otherwise Decision should have been to Scalarize. 2759 assert((ConsecutiveStride || CreateGatherScatter) && 2760 "The instruction should be scalarized"); 2761 (void)ConsecutiveStride; 2762 2763 VectorParts BlockInMaskParts(UF); 2764 bool isMaskRequired = BlockInMask; 2765 if (isMaskRequired) 2766 for (unsigned Part = 0; Part < UF; ++Part) 2767 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2768 2769 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2770 // Calculate the pointer for the specific unroll-part. 2771 GetElementPtrInst *PartPtr = nullptr; 2772 2773 bool InBounds = false; 2774 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2775 InBounds = gep->isInBounds(); 2776 2777 if (Reverse) { 2778 assert(!VF.isScalable() && 2779 "Reversing vectors is not yet supported for scalable vectors."); 2780 2781 // If the address is consecutive but reversed, then the 2782 // wide store needs to start at the last vector element. 
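// For example, assuming a fixed VF of 4 and UF = 2, the two GEPs below offset
// the pointer by -Part * 4 elements and then by 1 - 4 = -3 more, so part 0
// accesses elements [Ptr - 3, Ptr] and part 1 accesses [Ptr - 7, Ptr - 4];
// the loaded or stored vector (and any mask) is reversed separately to match
// this layout.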
2783 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2784 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2785 PartPtr->setIsInBounds(InBounds); 2786 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2787 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2788 PartPtr->setIsInBounds(InBounds); 2789 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2790 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2791 } else { 2792 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2793 PartPtr = cast<GetElementPtrInst>( 2794 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2795 PartPtr->setIsInBounds(InBounds); 2796 } 2797 2798 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2799 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2800 }; 2801 2802 // Handle Stores: 2803 if (SI) { 2804 setDebugLocFromInst(Builder, SI); 2805 2806 for (unsigned Part = 0; Part < UF; ++Part) { 2807 Instruction *NewSI = nullptr; 2808 Value *StoredVal = State.get(StoredValue, Part); 2809 if (CreateGatherScatter) { 2810 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2811 Value *VectorGep = State.get(Addr, Part); 2812 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2813 MaskPart); 2814 } else { 2815 if (Reverse) { 2816 // If we store to reverse consecutive memory locations, then we need 2817 // to reverse the order of elements in the stored value. 2818 StoredVal = reverseVector(StoredVal); 2819 // We don't want to update the value in the map as it might be used in 2820 // another expression. So don't call resetVectorValue(StoredVal). 2821 } 2822 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2823 if (isMaskRequired) 2824 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2825 BlockInMaskParts[Part]); 2826 else 2827 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2828 } 2829 addMetadata(NewSI, SI); 2830 } 2831 return; 2832 } 2833 2834 // Handle loads. 2835 assert(LI && "Must have a load instruction"); 2836 setDebugLocFromInst(Builder, LI); 2837 for (unsigned Part = 0; Part < UF; ++Part) { 2838 Value *NewLI; 2839 if (CreateGatherScatter) { 2840 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2841 Value *VectorGep = State.get(Addr, Part); 2842 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2843 nullptr, "wide.masked.gather"); 2844 addMetadata(NewLI, LI); 2845 } else { 2846 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2847 if (isMaskRequired) 2848 NewLI = Builder.CreateMaskedLoad( 2849 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2850 "wide.masked.load"); 2851 else 2852 NewLI = 2853 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2854 2855 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2856 addMetadata(NewLI, LI); 2857 if (Reverse) 2858 NewLI = reverseVector(NewLI); 2859 } 2860 2861 State.set(Def, Instr, NewLI, Part); 2862 } 2863 } 2864 2865 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2866 const VPIteration &Instance, 2867 bool IfPredicateInstr, 2868 VPTransformState &State) { 2869 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2870 2871 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2872 // the first lane and part. 
2873 if (auto *II = dyn_cast<IntrinsicInst>(Instr)) 2874 if (Instance.Lane != 0 || Instance.Part != 0) 2875 if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) 2876 return; 2877 2878 setDebugLocFromInst(Builder, Instr); 2879 2880 // Does this instruction return a value ? 2881 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2882 2883 Instruction *Cloned = Instr->clone(); 2884 if (!IsVoidRetTy) 2885 Cloned->setName(Instr->getName() + ".cloned"); 2886 2887 // Replace the operands of the cloned instructions with their scalar 2888 // equivalents in the new loop. 2889 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2890 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 2891 auto InputInstance = Instance; 2892 if (!Operand || !OrigLoop->contains(Operand) || 2893 (Cost->isUniformAfterVectorization(Operand, State.VF))) 2894 InputInstance.Lane = 0; 2895 auto *NewOp = State.get(User.getOperand(op), InputInstance); 2896 Cloned->setOperand(op, NewOp); 2897 } 2898 addNewMetadata(Cloned, Instr); 2899 2900 // Place the cloned scalar in the new loop. 2901 Builder.Insert(Cloned); 2902 2903 // TODO: Set result for VPValue of VPReciplicateRecipe. This requires 2904 // representing scalar values in VPTransformState. Add the cloned scalar to 2905 // the scalar map entry. 2906 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2907 2908 // If we just cloned a new assumption, add it the assumption cache. 2909 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2910 if (II->getIntrinsicID() == Intrinsic::assume) 2911 AC->registerAssumption(II); 2912 2913 // End if-block. 2914 if (IfPredicateInstr) 2915 PredicatedInstructions.push_back(Cloned); 2916 } 2917 2918 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2919 Value *End, Value *Step, 2920 Instruction *DL) { 2921 BasicBlock *Header = L->getHeader(); 2922 BasicBlock *Latch = L->getLoopLatch(); 2923 // As we're just creating this loop, it's possible no latch exists 2924 // yet. If so, use the header as this will be a single block loop. 2925 if (!Latch) 2926 Latch = Header; 2927 2928 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2929 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2930 setDebugLocFromInst(Builder, OldInst); 2931 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2932 2933 Builder.SetInsertPoint(Latch->getTerminator()); 2934 setDebugLocFromInst(Builder, OldInst); 2935 2936 // Create i+1 and fill the PHINode. 2937 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2938 Induction->addIncoming(Start, L->getLoopPreheader()); 2939 Induction->addIncoming(Next, Latch); 2940 // Create the compare. 2941 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2942 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 2943 2944 // Now we have two terminators. Remove the old one from the block. 2945 Latch->getTerminator()->eraseFromParent(); 2946 2947 return Induction; 2948 } 2949 2950 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2951 if (TripCount) 2952 return TripCount; 2953 2954 assert(L && "Create Trip Count for null loop."); 2955 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2956 // Find the loop boundaries. 
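// For example, assuming a loop of the form 'for (i = 0; i != n; ++i)' with
// n >= 1, the backedge-taken count is n - 1 and the trip count expanded below
// is (n - 1) + 1, truncated or zero-extended to the widest induction type as
// needed.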
2957 ScalarEvolution *SE = PSE.getSE(); 2958 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2959 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2960 "Invalid loop count"); 2961 2962 Type *IdxTy = Legal->getWidestInductionType(); 2963 assert(IdxTy && "No type for induction"); 2964 2965 // The exit count might have the type of i64 while the phi is i32. This can 2966 // happen if we have an induction variable that is sign extended before the 2967 // compare. The only way that we get a backedge taken count is that the 2968 // induction variable was signed and as such will not overflow. In such a case 2969 // truncation is legal. 2970 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2971 IdxTy->getPrimitiveSizeInBits()) 2972 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2973 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2974 2975 // Get the total trip count from the count by adding 1. 2976 const SCEV *ExitCount = SE->getAddExpr( 2977 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2978 2979 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2980 2981 // Expand the trip count and place the new instructions in the preheader. 2982 // Notice that the pre-header does not change, only the loop body. 2983 SCEVExpander Exp(*SE, DL, "induction"); 2984 2985 // Count holds the overall loop count (N). 2986 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2987 L->getLoopPreheader()->getTerminator()); 2988 2989 if (TripCount->getType()->isPointerTy()) 2990 TripCount = 2991 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2992 L->getLoopPreheader()->getTerminator()); 2993 2994 return TripCount; 2995 } 2996 2997 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2998 if (VectorTripCount) 2999 return VectorTripCount; 3000 3001 Value *TC = getOrCreateTripCount(L); 3002 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3003 3004 Type *Ty = TC->getType(); 3005 // This is where we can make the step a runtime constant. 3006 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3007 3008 // If the tail is to be folded by masking, round the number of iterations N 3009 // up to a multiple of Step instead of rounding down. This is done by first 3010 // adding Step-1 and then rounding down. Note that it's ok if this addition 3011 // overflows: the vector induction variable will eventually wrap to zero given 3012 // that it starts at zero and its Step is a power of two; the loop will then 3013 // exit, with the last early-exit vector comparison also producing all-true. 3014 if (Cost->foldTailByMasking()) { 3015 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3016 "VF*UF must be a power of 2 when folding tail by masking"); 3017 assert(!VF.isScalable() && 3018 "Tail folding not yet supported for scalable vectors"); 3019 TC = Builder.CreateAdd( 3020 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3021 } 3022 3023 // Now we need to generate the expression for the part of the loop that the 3024 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3025 // iterations are not required for correctness, or N - Step, otherwise. Step 3026 // is equal to the vectorization factor (number of SIMD elements) times the 3027 // unroll factor (number of SIMD instructions). 
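// For example, assuming a trip count of 19 with VF = 4 and UF = 2 (Step = 8):
// without tail folding, n.mod.vf is 3 and the vector loop runs 16 iterations,
// leaving 3 for the scalar remainder; with tail folding, Step - 1 = 7 was
// added above, giving 26, so n.vec becomes 24 (19 rounded up to a multiple of
// 8) and the masked vector loop covers all iterations. If a scalar epilogue
// is required and the remainder would be 0, it is bumped to Step below so
// that the scalar loop is guaranteed to run.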
3028 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3029 3030 // There are two cases where we need to ensure (at least) the last iteration 3031 // runs in the scalar remainder loop. Thus, if the step evenly divides 3032 // the trip count, we set the remainder to be equal to the step. If the step 3033 // does not evenly divide the trip count, no adjustment is necessary since 3034 // there will already be scalar iterations. Note that the minimum iterations 3035 // check ensures that N >= Step. The cases are: 3036 // 1) If there is a non-reversed interleaved group that may speculatively 3037 // access memory out-of-bounds. 3038 // 2) If any instruction may follow a conditionally taken exit. That is, if 3039 // the loop contains multiple exiting blocks, or a single exiting block 3040 // which is not the latch. 3041 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3042 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3043 R = Builder.CreateSelect(IsZero, Step, R); 3044 } 3045 3046 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3047 3048 return VectorTripCount; 3049 } 3050 3051 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3052 const DataLayout &DL) { 3053 // Verify that V is a vector type with same number of elements as DstVTy. 3054 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3055 unsigned VF = DstFVTy->getNumElements(); 3056 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3057 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3058 Type *SrcElemTy = SrcVecTy->getElementType(); 3059 Type *DstElemTy = DstFVTy->getElementType(); 3060 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3061 "Vector elements must have same size"); 3062 3063 // Do a direct cast if element types are castable. 3064 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3065 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3066 } 3067 // V cannot be directly casted to desired vector type. 3068 // May happen when V is a floating point vector but DstVTy is a vector of 3069 // pointers or vice-versa. Handle this using a two-step bitcast using an 3070 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3071 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3072 "Only one type should be a pointer type"); 3073 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3074 "Only one type should be a floating point type"); 3075 Type *IntTy = 3076 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3077 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3078 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3079 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3080 } 3081 3082 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3083 BasicBlock *Bypass) { 3084 Value *Count = getOrCreateTripCount(L); 3085 // Reuse existing vector loop preheader for TC checks. 3086 // Note that new preheader block is generated for vector loop. 3087 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3088 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3089 3090 // Generate code to check if the loop's trip count is less than VF * UF, or 3091 // equal to it in case a scalar epilogue is required; this implies that the 3092 // vector trip count is zero. 
This check also covers the case where adding one 3093 // to the backedge-taken count overflowed leading to an incorrect trip count 3094 // of zero. In this case we will also jump to the scalar loop. 3095 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3096 : ICmpInst::ICMP_ULT; 3097 3098 // If tail is to be folded, vector loop takes care of all iterations. 3099 Value *CheckMinIters = Builder.getFalse(); 3100 if (!Cost->foldTailByMasking()) { 3101 Value *Step = 3102 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3103 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3104 } 3105 // Create new preheader for vector loop. 3106 LoopVectorPreHeader = 3107 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3108 "vector.ph"); 3109 3110 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3111 DT->getNode(Bypass)->getIDom()) && 3112 "TC check is expected to dominate Bypass"); 3113 3114 // Update dominator for Bypass & LoopExit. 3115 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3116 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3117 3118 ReplaceInstWithInst( 3119 TCCheckBlock->getTerminator(), 3120 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3121 LoopBypassBlocks.push_back(TCCheckBlock); 3122 } 3123 3124 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3125 // Reuse existing vector loop preheader for SCEV checks. 3126 // Note that new preheader block is generated for vector loop. 3127 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 3128 3129 // Generate the code to check that the SCEV assumptions that we made. 3130 // We want the new basic block to start at the first instruction in a 3131 // sequence of instructions that form a check. 3132 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 3133 "scev.check"); 3134 Value *SCEVCheck = Exp.expandCodeForPredicate( 3135 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 3136 3137 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 3138 if (C->isZero()) 3139 return; 3140 3141 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3142 (OptForSizeBasedOnProfile && 3143 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3144 "Cannot SCEV check stride or overflow when optimizing for size"); 3145 3146 SCEVCheckBlock->setName("vector.scevcheck"); 3147 // Create new preheader for vector loop. 3148 LoopVectorPreHeader = 3149 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 3150 nullptr, "vector.ph"); 3151 3152 // Update dominator only if this is first RT check. 3153 if (LoopBypassBlocks.empty()) { 3154 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3155 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3156 } 3157 3158 ReplaceInstWithInst( 3159 SCEVCheckBlock->getTerminator(), 3160 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 3161 LoopBypassBlocks.push_back(SCEVCheckBlock); 3162 AddedSafetyChecks = true; 3163 } 3164 3165 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 3166 // VPlan-native path does not do any analysis for runtime checks currently. 3167 if (EnableVPlanNativePath) 3168 return; 3169 3170 // Reuse existing vector loop preheader for runtime memory checks. 3171 // Note that new preheader block is generated for vector loop. 3172 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 3173 3174 // Generate the code that checks in runtime if arrays overlap. 
We put the 3175 // checks into a separate block to make the more common case of few elements 3176 // faster. 3177 auto *LAI = Legal->getLAI(); 3178 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 3179 if (!RtPtrChecking.Need) 3180 return; 3181 3182 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3183 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3184 "Cannot emit memory checks when optimizing for size, unless forced " 3185 "to vectorize."); 3186 ORE->emit([&]() { 3187 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3188 L->getStartLoc(), L->getHeader()) 3189 << "Code-size may be reduced by not forcing " 3190 "vectorization, or by source-code modifications " 3191 "eliminating the need for runtime checks " 3192 "(e.g., adding 'restrict')."; 3193 }); 3194 } 3195 3196 MemCheckBlock->setName("vector.memcheck"); 3197 // Create new preheader for vector loop. 3198 LoopVectorPreHeader = 3199 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 3200 "vector.ph"); 3201 3202 auto *CondBranch = cast<BranchInst>( 3203 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 3204 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 3205 LoopBypassBlocks.push_back(MemCheckBlock); 3206 AddedSafetyChecks = true; 3207 3208 // Update dominator only if this is first RT check. 3209 if (LoopBypassBlocks.empty()) { 3210 DT->changeImmediateDominator(Bypass, MemCheckBlock); 3211 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 3212 } 3213 3214 Instruction *FirstCheckInst; 3215 Instruction *MemRuntimeCheck; 3216 std::tie(FirstCheckInst, MemRuntimeCheck) = 3217 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 3218 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 3219 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 3220 "claimed checks are required"); 3221 CondBranch->setCondition(MemRuntimeCheck); 3222 3223 // We currently don't use LoopVersioning for the actual loop cloning but we 3224 // still use it to add the noalias metadata. 3225 LVer = std::make_unique<LoopVersioning>( 3226 *Legal->getLAI(), 3227 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3228 DT, PSE.getSE()); 3229 LVer->prepareNoAliasMetadata(); 3230 } 3231 3232 Value *InnerLoopVectorizer::emitTransformedIndex( 3233 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3234 const InductionDescriptor &ID) const { 3235 3236 SCEVExpander Exp(*SE, DL, "induction"); 3237 auto Step = ID.getStep(); 3238 auto StartValue = ID.getStartValue(); 3239 assert(Index->getType() == Step->getType() && 3240 "Index type does not match StepValue type"); 3241 3242 // Note: the IR at this point is broken. We cannot use SE to create any new 3243 // SCEV and then expand it, hoping that SCEV's simplification will give us 3244 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3245 // lead to various SCEV crashes. So all we can do is to use builder and rely 3246 // on InstCombine for future simplifications. Here we handle some trivial 3247 // cases only. 
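// For example, the helpers below fold 'X + 0' and '0 + Y' to the non-zero
// operand and 'X * 1' and '1 * Y' to the non-one operand; every other case is
// emitted as a plain add/mul and left for InstCombine to simplify.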
3248 auto CreateAdd = [&B](Value *X, Value *Y) { 3249 assert(X->getType() == Y->getType() && "Types don't match!"); 3250 if (auto *CX = dyn_cast<ConstantInt>(X)) 3251 if (CX->isZero()) 3252 return Y; 3253 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3254 if (CY->isZero()) 3255 return X; 3256 return B.CreateAdd(X, Y); 3257 }; 3258 3259 auto CreateMul = [&B](Value *X, Value *Y) { 3260 assert(X->getType() == Y->getType() && "Types don't match!"); 3261 if (auto *CX = dyn_cast<ConstantInt>(X)) 3262 if (CX->isOne()) 3263 return Y; 3264 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3265 if (CY->isOne()) 3266 return X; 3267 return B.CreateMul(X, Y); 3268 }; 3269 3270 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3271 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3272 // the DomTree is not kept up-to-date for additional blocks generated in the 3273 // vector loop. By using the header as insertion point, we guarantee that the 3274 // expanded instructions dominate all their uses. 3275 auto GetInsertPoint = [this, &B]() { 3276 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3277 if (InsertBB != LoopVectorBody && 3278 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3279 return LoopVectorBody->getTerminator(); 3280 return &*B.GetInsertPoint(); 3281 }; 3282 switch (ID.getKind()) { 3283 case InductionDescriptor::IK_IntInduction: { 3284 assert(Index->getType() == StartValue->getType() && 3285 "Index type does not match StartValue type"); 3286 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3287 return B.CreateSub(StartValue, Index); 3288 auto *Offset = CreateMul( 3289 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3290 return CreateAdd(StartValue, Offset); 3291 } 3292 case InductionDescriptor::IK_PtrInduction: { 3293 assert(isa<SCEVConstant>(Step) && 3294 "Expected constant step for pointer induction"); 3295 return B.CreateGEP( 3296 StartValue->getType()->getPointerElementType(), StartValue, 3297 CreateMul(Index, 3298 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3299 } 3300 case InductionDescriptor::IK_FpInduction: { 3301 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3302 auto InductionBinOp = ID.getInductionBinOp(); 3303 assert(InductionBinOp && 3304 (InductionBinOp->getOpcode() == Instruction::FAdd || 3305 InductionBinOp->getOpcode() == Instruction::FSub) && 3306 "Original bin op should be defined for FP induction"); 3307 3308 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3309 3310 // Floating point operations had to be 'fast' to enable the induction. 3311 FastMathFlags Flags; 3312 Flags.setFast(); 3313 3314 Value *MulExp = B.CreateFMul(StepValue, Index); 3315 if (isa<Instruction>(MulExp)) 3316 // We have to check, the MulExp may be a constant. 
3317 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3318 3319 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3320 "induction"); 3321 if (isa<Instruction>(BOp)) 3322 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3323 3324 return BOp; 3325 } 3326 case InductionDescriptor::IK_NoInduction: 3327 return nullptr; 3328 } 3329 llvm_unreachable("invalid enum"); 3330 } 3331 3332 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3333 LoopScalarBody = OrigLoop->getHeader(); 3334 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3335 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3336 assert(LoopExitBlock && "Must have an exit block"); 3337 assert(LoopVectorPreHeader && "Invalid loop structure"); 3338 3339 LoopMiddleBlock = 3340 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3341 LI, nullptr, Twine(Prefix) + "middle.block"); 3342 LoopScalarPreHeader = 3343 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3344 nullptr, Twine(Prefix) + "scalar.ph"); 3345 3346 // Set up branch from middle block to the exit and scalar preheader blocks. 3347 // completeLoopSkeleton will update the condition to use an iteration check, 3348 // if required to decide whether to execute the remainder. 3349 BranchInst *BrInst = 3350 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3351 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3352 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3353 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3354 3355 // We intentionally don't let SplitBlock to update LoopInfo since 3356 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3357 // LoopVectorBody is explicitly added to the correct place few lines later. 3358 LoopVectorBody = 3359 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3360 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3361 3362 // Update dominator for loop exit. 3363 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3364 3365 // Create and register the new vector loop. 3366 Loop *Lp = LI->AllocateLoop(); 3367 Loop *ParentLoop = OrigLoop->getParentLoop(); 3368 3369 // Insert the new loop into the loop nest and register the new basic blocks 3370 // before calling any utilities such as SCEV that require valid LoopInfo. 3371 if (ParentLoop) { 3372 ParentLoop->addChildLoop(Lp); 3373 } else { 3374 LI->addTopLevelLoop(Lp); 3375 } 3376 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3377 return Lp; 3378 } 3379 3380 void InnerLoopVectorizer::createInductionResumeValues( 3381 Loop *L, Value *VectorTripCount, 3382 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3383 assert(VectorTripCount && L && "Expected valid arguments"); 3384 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3385 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3386 "Inconsistent information about additional bypass."); 3387 // We are going to resume the execution of the scalar loop. 3388 // Go over all of the induction variables that we found and fix the 3389 // PHIs that are left in the scalar version of the loop. 3390 // The starting values of PHI nodes depend on the counter of the last 3391 // iteration in the vectorized loop. 3392 // If we come from a bypass edge then we need to start from the original 3393 // start value. 
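// For example, assuming a secondary induction 'j = phi [5, ph], [j + 2, latch]'
// and a vector trip count N, the resume value built below is 5 + 2 * N when
// entering the scalar loop from the middle block, and the original start
// value 5 when entering from a bypass block.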
3394 for (auto &InductionEntry : Legal->getInductionVars()) { 3395 PHINode *OrigPhi = InductionEntry.first; 3396 InductionDescriptor II = InductionEntry.second; 3397 3398 // Create phi nodes to merge from the backedge-taken check block. 3399 PHINode *BCResumeVal = 3400 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3401 LoopScalarPreHeader->getTerminator()); 3402 // Copy original phi DL over to the new one. 3403 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3404 Value *&EndValue = IVEndValues[OrigPhi]; 3405 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3406 if (OrigPhi == OldInduction) { 3407 // We know what the end value is. 3408 EndValue = VectorTripCount; 3409 } else { 3410 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3411 Type *StepType = II.getStep()->getType(); 3412 Instruction::CastOps CastOp = 3413 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3414 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3415 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3416 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3417 EndValue->setName("ind.end"); 3418 3419 // Compute the end value for the additional bypass (if applicable). 3420 if (AdditionalBypass.first) { 3421 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3422 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3423 StepType, true); 3424 CRD = 3425 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3426 EndValueFromAdditionalBypass = 3427 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3428 EndValueFromAdditionalBypass->setName("ind.end"); 3429 } 3430 } 3431 // The new PHI merges the original incoming value, in case of a bypass, 3432 // or the value at the end of the vectorized loop. 3433 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3434 3435 // Fix the scalar body counter (PHI node). 3436 // The old induction's phi node in the scalar body needs the truncated 3437 // value. 3438 for (BasicBlock *BB : LoopBypassBlocks) 3439 BCResumeVal->addIncoming(II.getStartValue(), BB); 3440 3441 if (AdditionalBypass.first) 3442 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3443 EndValueFromAdditionalBypass); 3444 3445 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3446 } 3447 } 3448 3449 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3450 MDNode *OrigLoopID) { 3451 assert(L && "Expected valid loop."); 3452 3453 // The trip counts should be cached by now. 3454 Value *Count = getOrCreateTripCount(L); 3455 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3456 3457 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3458 3459 // Add a check in the middle block to see if we have completed 3460 // all of the iterations in the first vector loop. 3461 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3462 // If tail is to be folded, we know we don't need to run the remainder. 3463 if (!Cost->foldTailByMasking()) { 3464 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3465 Count, VectorTripCount, "cmp.n", 3466 LoopMiddleBlock->getTerminator()); 3467 3468 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3469 // of the corresponding compare because they may have ended up with 3470 // different line numbers and we want to avoid awkward line stepping while 3471 // debugging. Eg. if the compare has got a line number inside the loop. 
3472 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3473 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3474 } 3475 3476 // Get ready to start creating new instructions into the vectorized body. 3477 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3478 "Inconsistent vector loop preheader"); 3479 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3480 3481 Optional<MDNode *> VectorizedLoopID = 3482 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3483 LLVMLoopVectorizeFollowupVectorized}); 3484 if (VectorizedLoopID.hasValue()) { 3485 L->setLoopID(VectorizedLoopID.getValue()); 3486 3487 // Do not setAlreadyVectorized if loop attributes have been defined 3488 // explicitly. 3489 return LoopVectorPreHeader; 3490 } 3491 3492 // Keep all loop hints from the original loop on the vector loop (we'll 3493 // replace the vectorizer-specific hints below). 3494 if (MDNode *LID = OrigLoop->getLoopID()) 3495 L->setLoopID(LID); 3496 3497 LoopVectorizeHints Hints(L, true, *ORE); 3498 Hints.setAlreadyVectorized(); 3499 3500 #ifdef EXPENSIVE_CHECKS 3501 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3502 LI->verify(*DT); 3503 #endif 3504 3505 return LoopVectorPreHeader; 3506 } 3507 3508 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3509 /* 3510 In this function we generate a new loop. The new loop will contain 3511 the vectorized instructions while the old loop will continue to run the 3512 scalar remainder. 3513 3514 [ ] <-- loop iteration number check. 3515 / | 3516 / v 3517 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3518 | / | 3519 | / v 3520 || [ ] <-- vector pre header. 3521 |/ | 3522 | v 3523 | [ ] \ 3524 | [ ]_| <-- vector loop. 3525 | | 3526 | v 3527 | -[ ] <--- middle-block. 3528 | / | 3529 | / v 3530 -|- >[ ] <--- new preheader. 3531 | | 3532 | v 3533 | [ ] \ 3534 | [ ]_| <-- old scalar loop to handle remainder. 3535 \ | 3536 \ v 3537 >[ ] <-- exit block. 3538 ... 3539 */ 3540 3541 // Get the metadata of the original loop before it gets modified. 3542 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3543 3544 // Create an empty vector loop, and prepare basic blocks for the runtime 3545 // checks. 3546 Loop *Lp = createVectorLoopSkeleton(""); 3547 3548 // Now, compare the new count to zero. If it is zero skip the vector loop and 3549 // jump to the scalar loop. This check also covers the case where the 3550 // backedge-taken count is uint##_max: adding one to it will overflow leading 3551 // to an incorrect trip count of zero. In this (rare) case we will also jump 3552 // to the scalar loop. 3553 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3554 3555 // Generate the code to check any assumptions that we've made for SCEV 3556 // expressions. 3557 emitSCEVChecks(Lp, LoopScalarPreHeader); 3558 3559 // Generate the code that checks in runtime if arrays overlap. We put the 3560 // checks into a separate block to make the more common case of few elements 3561 // faster. 3562 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3563 3564 // Some loops have a single integer induction variable, while other loops 3565 // don't. One example is c++ iterators that often have multiple pointer 3566 // induction variables. In the code below we also support a case where we 3567 // don't have a single induction variable. 3568 // 3569 // We try to obtain an induction variable from the original loop as hard 3570 // as possible. 
However if we don't find one that: 3571 // - is an integer 3572 // - counts from zero, stepping by one 3573 // - is the size of the widest induction variable type 3574 // then we create a new one. 3575 OldInduction = Legal->getPrimaryInduction(); 3576 Type *IdxTy = Legal->getWidestInductionType(); 3577 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3578 // The loop step is equal to the vectorization factor (num of SIMD elements) 3579 // times the unroll factor (num of SIMD instructions). 3580 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3581 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3582 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3583 Induction = 3584 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3585 getDebugLocFromInstOrOperands(OldInduction)); 3586 3587 // Emit phis for the new starting index of the scalar loop. 3588 createInductionResumeValues(Lp, CountRoundDown); 3589 3590 return completeLoopSkeleton(Lp, OrigLoopID); 3591 } 3592 3593 // Fix up external users of the induction variable. At this point, we are 3594 // in LCSSA form, with all external PHIs that use the IV having one input value, 3595 // coming from the remainder loop. We need those PHIs to also have a correct 3596 // value for the IV when arriving directly from the middle block. 3597 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3598 const InductionDescriptor &II, 3599 Value *CountRoundDown, Value *EndValue, 3600 BasicBlock *MiddleBlock) { 3601 // There are two kinds of external IV usages - those that use the value 3602 // computed in the last iteration (the PHI) and those that use the penultimate 3603 // value (the value that feeds into the phi from the loop latch). 3604 // We allow both, but they, obviously, have different values. 3605 3606 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3607 3608 DenseMap<Value *, Value *> MissingVals; 3609 3610 // An external user of the last iteration's value should see the value that 3611 // the remainder loop uses to initialize its own IV. 3612 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3613 for (User *U : PostInc->users()) { 3614 Instruction *UI = cast<Instruction>(U); 3615 if (!OrigLoop->contains(UI)) { 3616 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3617 MissingVals[UI] = EndValue; 3618 } 3619 } 3620 3621 // An external user of the penultimate value need to see EndValue - Step. 3622 // The simplest way to get this is to recompute it from the constituent SCEVs, 3623 // that is Start + (Step * (CRD - 1)). 3624 for (User *U : OrigPhi->users()) { 3625 auto *UI = cast<Instruction>(U); 3626 if (!OrigLoop->contains(UI)) { 3627 const DataLayout &DL = 3628 OrigLoop->getHeader()->getModule()->getDataLayout(); 3629 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3630 3631 IRBuilder<> B(MiddleBlock->getTerminator()); 3632 Value *CountMinusOne = B.CreateSub( 3633 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3634 Value *CMO = 3635 !II.getStep()->getType()->isIntegerTy() 3636 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3637 II.getStep()->getType()) 3638 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3639 CMO->setName("cast.cmo"); 3640 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3641 Escape->setName("ind.escape"); 3642 MissingVals[UI] = Escape; 3643 } 3644 } 3645 3646 for (auto &I : MissingVals) { 3647 PHINode *PHI = cast<PHINode>(I.first); 3648 // One corner case we have to handle is two IVs "chasing" each-other, 3649 // that is %IV2 = phi [...], [ %IV1, %latch ] 3650 // In this case, if IV1 has an external use, we need to avoid adding both 3651 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3652 // don't already have an incoming value for the middle block. 3653 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3654 PHI->addIncoming(I.second, MiddleBlock); 3655 } 3656 } 3657 3658 namespace { 3659 3660 struct CSEDenseMapInfo { 3661 static bool canHandle(const Instruction *I) { 3662 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3663 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3664 } 3665 3666 static inline Instruction *getEmptyKey() { 3667 return DenseMapInfo<Instruction *>::getEmptyKey(); 3668 } 3669 3670 static inline Instruction *getTombstoneKey() { 3671 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3672 } 3673 3674 static unsigned getHashValue(const Instruction *I) { 3675 assert(canHandle(I) && "Unknown instruction!"); 3676 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3677 I->value_op_end())); 3678 } 3679 3680 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3681 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3682 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3683 return LHS == RHS; 3684 return LHS->isIdenticalTo(RHS); 3685 } 3686 }; 3687 3688 } // end anonymous namespace 3689 3690 ///Perform cse of induction variable instructions. 3691 static void cse(BasicBlock *BB) { 3692 // Perform simple cse. 3693 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3694 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3695 Instruction *In = &*I++; 3696 3697 if (!CSEDenseMapInfo::canHandle(In)) 3698 continue; 3699 3700 // Check if we can replace this instruction with any of the 3701 // visited instructions. 3702 if (Instruction *V = CSEMap.lookup(In)) { 3703 In->replaceAllUsesWith(V); 3704 In->eraseFromParent(); 3705 continue; 3706 } 3707 3708 CSEMap[In] = In; 3709 } 3710 } 3711 3712 InstructionCost 3713 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3714 bool &NeedToScalarize) { 3715 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3716 Function *F = CI->getCalledFunction(); 3717 Type *ScalarRetTy = CI->getType(); 3718 SmallVector<Type *, 4> Tys, ScalarTys; 3719 for (auto &ArgOp : CI->arg_operands()) 3720 ScalarTys.push_back(ArgOp->getType()); 3721 3722 // Estimate cost of scalarized vector call. The source operands are assumed 3723 // to be vectors, so we need to extract individual elements from there, 3724 // execute VF scalar calls, and then gather the result into the vector return 3725 // value. 3726 InstructionCost ScalarCallCost = 3727 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3728 if (VF.isScalar()) 3729 return ScalarCallCost; 3730 3731 // Compute corresponding vector type for return value and arguments. 
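// For example, assuming VF = 4 and a hypothetical call 'float foo(float)',
// the scalarized estimate is 4 * cost(foo) plus the overhead of unpacking the
// vector argument and packing the four results; it is only replaced below if
// VFDatabase knows a vector variant of foo whose call cost is lower.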
3732 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3733 for (Type *ScalarTy : ScalarTys) 3734 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3735 3736 // Compute costs of unpacking argument values for the scalar calls and 3737 // packing the return values to a vector. 3738 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3739 3740 InstructionCost Cost = 3741 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3742 3743 // If we can't emit a vector call for this function, then the currently found 3744 // cost is the cost we need to return. 3745 NeedToScalarize = true; 3746 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3747 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3748 3749 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3750 return Cost; 3751 3752 // If the corresponding vector cost is cheaper, return its cost. 3753 InstructionCost VectorCallCost = 3754 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3755 if (VectorCallCost < Cost) { 3756 NeedToScalarize = false; 3757 Cost = VectorCallCost; 3758 } 3759 return Cost; 3760 } 3761 3762 InstructionCost 3763 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3764 ElementCount VF) { 3765 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3766 assert(ID && "Expected intrinsic call!"); 3767 3768 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3769 return TTI.getIntrinsicInstrCost(CostAttrs, 3770 TargetTransformInfo::TCK_RecipThroughput); 3771 } 3772 3773 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3774 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3775 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3776 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3777 } 3778 3779 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3780 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3781 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3782 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3783 } 3784 3785 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3786 // For every instruction `I` in MinBWs, truncate the operands, create a 3787 // truncated version of `I` and reextend its result. InstCombine runs 3788 // later and will remove any ext/trunc pairs. 3789 SmallPtrSet<Value *, 4> Erased; 3790 for (const auto &KV : Cost->getMinimalBitwidths()) { 3791 // If the value wasn't vectorized, we must maintain the original scalar 3792 // type. The absence of the value from VectorLoopValueMap indicates that it 3793 // wasn't vectorized. 
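// For example, assuming MinBWs records that an i32 add only needs 8 bits and
// VF = 4, the <4 x i32> add is recreated below as a <4 x i8> add on truncated
// operands and its result is zero-extended back to <4 x i32>; the redundant
// ext/trunc pairs are expected to be cleaned up by InstCombine later.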
3794 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3795 continue; 3796 for (unsigned Part = 0; Part < UF; ++Part) { 3797 Value *I = getOrCreateVectorValue(KV.first, Part); 3798 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3799 continue; 3800 Type *OriginalTy = I->getType(); 3801 Type *ScalarTruncatedTy = 3802 IntegerType::get(OriginalTy->getContext(), KV.second); 3803 auto *TruncatedTy = FixedVectorType::get( 3804 ScalarTruncatedTy, 3805 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3806 if (TruncatedTy == OriginalTy) 3807 continue; 3808 3809 IRBuilder<> B(cast<Instruction>(I)); 3810 auto ShrinkOperand = [&](Value *V) -> Value * { 3811 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3812 if (ZI->getSrcTy() == TruncatedTy) 3813 return ZI->getOperand(0); 3814 return B.CreateZExtOrTrunc(V, TruncatedTy); 3815 }; 3816 3817 // The actual instruction modification depends on the instruction type, 3818 // unfortunately. 3819 Value *NewI = nullptr; 3820 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3821 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3822 ShrinkOperand(BO->getOperand(1))); 3823 3824 // Any wrapping introduced by shrinking this operation shouldn't be 3825 // considered undefined behavior. So, we can't unconditionally copy 3826 // arithmetic wrapping flags to NewI. 3827 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3828 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3829 NewI = 3830 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3831 ShrinkOperand(CI->getOperand(1))); 3832 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3833 NewI = B.CreateSelect(SI->getCondition(), 3834 ShrinkOperand(SI->getTrueValue()), 3835 ShrinkOperand(SI->getFalseValue())); 3836 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3837 switch (CI->getOpcode()) { 3838 default: 3839 llvm_unreachable("Unhandled cast!"); 3840 case Instruction::Trunc: 3841 NewI = ShrinkOperand(CI->getOperand(0)); 3842 break; 3843 case Instruction::SExt: 3844 NewI = B.CreateSExtOrTrunc( 3845 CI->getOperand(0), 3846 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3847 break; 3848 case Instruction::ZExt: 3849 NewI = B.CreateZExtOrTrunc( 3850 CI->getOperand(0), 3851 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3852 break; 3853 } 3854 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3855 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3856 ->getNumElements(); 3857 auto *O0 = B.CreateZExtOrTrunc( 3858 SI->getOperand(0), 3859 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3860 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3861 ->getNumElements(); 3862 auto *O1 = B.CreateZExtOrTrunc( 3863 SI->getOperand(1), 3864 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3865 3866 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3867 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3868 // Don't do anything with the operands, just extend the result. 
3869 continue; 3870 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3871 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3872 ->getNumElements(); 3873 auto *O0 = B.CreateZExtOrTrunc( 3874 IE->getOperand(0), 3875 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3876 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3877 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3878 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3879 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3880 ->getNumElements(); 3881 auto *O0 = B.CreateZExtOrTrunc( 3882 EE->getOperand(0), 3883 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3884 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3885 } else { 3886 // If we don't know what to do, be conservative and don't do anything. 3887 continue; 3888 } 3889 3890 // Lastly, extend the result. 3891 NewI->takeName(cast<Instruction>(I)); 3892 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3893 I->replaceAllUsesWith(Res); 3894 cast<Instruction>(I)->eraseFromParent(); 3895 Erased.insert(I); 3896 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3897 } 3898 } 3899 3900 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3901 for (const auto &KV : Cost->getMinimalBitwidths()) { 3902 // If the value wasn't vectorized, we must maintain the original scalar 3903 // type. The absence of the value from VectorLoopValueMap indicates that it 3904 // wasn't vectorized. 3905 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3906 continue; 3907 for (unsigned Part = 0; Part < UF; ++Part) { 3908 Value *I = getOrCreateVectorValue(KV.first, Part); 3909 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3910 if (Inst && Inst->use_empty()) { 3911 Value *NewI = Inst->getOperand(0); 3912 Inst->eraseFromParent(); 3913 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3914 } 3915 } 3916 } 3917 } 3918 3919 void InnerLoopVectorizer::fixVectorizedLoop() { 3920 // Insert truncates and extends for any truncated instructions as hints to 3921 // InstCombine. 3922 if (VF.isVector()) 3923 truncateToMinimalBitwidths(); 3924 3925 // Fix widened non-induction PHIs by setting up the PHI operands. 3926 if (OrigPHIsToFix.size()) { 3927 assert(EnableVPlanNativePath && 3928 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3929 fixNonInductionPHIs(); 3930 } 3931 3932 // At this point every instruction in the original loop is widened to a 3933 // vector form. Now we need to fix the recurrences in the loop. These PHI 3934 // nodes are currently empty because we did not want to introduce cycles. 3935 // This is the second stage of vectorizing recurrences. 3936 fixCrossIterationPHIs(); 3937 3938 // Forget the original basic block. 3939 PSE.getSE()->forgetLoop(OrigLoop); 3940 3941 // Fix-up external users of the induction variables. 3942 for (auto &Entry : Legal->getInductionVars()) 3943 fixupIVUsers(Entry.first, Entry.second, 3944 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3945 IVEndValues[Entry.first], LoopMiddleBlock); 3946 3947 fixLCSSAPHIs(); 3948 for (Instruction *PI : PredicatedInstructions) 3949 sinkScalarOperands(&*PI); 3950 3951 // Remove redundant induction instructions. 3952 cse(LoopVectorBody); 3953 3954 // Set/update profile weights for the vector and remainder loops as original 3955 // loop iterations are now distributed among them. Note that original loop 3956 // represented by LoopScalarBody becomes remainder loop after vectorization. 
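// For example, assuming the original loop's profile reported an average of 80
// iterations per entry and VF * UF == 8, roughly 10 iterations per entry are
// attributed to the vector loop, with the leftover going to the scalar
// remainder loop.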
3957 // 3958 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3959 // end up getting slightly roughened result but that should be OK since 3960 // profile is not inherently precise anyway. Note also possible bypass of 3961 // vector code caused by legality checks is ignored, assigning all the weight 3962 // to the vector loop, optimistically. 3963 // 3964 // For scalable vectorization we can't know at compile time how many iterations 3965 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3966 // vscale of '1'. 3967 setProfileInfoAfterUnrolling( 3968 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 3969 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 3970 } 3971 3972 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3973 // In order to support recurrences we need to be able to vectorize Phi nodes. 3974 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3975 // stage #2: We now need to fix the recurrences by adding incoming edges to 3976 // the currently empty PHI nodes. At this point every instruction in the 3977 // original loop is widened to a vector form so we can use them to construct 3978 // the incoming edges. 3979 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3980 // Handle first-order recurrences and reductions that need to be fixed. 3981 if (Legal->isFirstOrderRecurrence(&Phi)) 3982 fixFirstOrderRecurrence(&Phi); 3983 else if (Legal->isReductionVariable(&Phi)) 3984 fixReduction(&Phi); 3985 } 3986 } 3987 3988 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3989 // This is the second phase of vectorizing first-order recurrences. An 3990 // overview of the transformation is described below. Suppose we have the 3991 // following loop. 3992 // 3993 // for (int i = 0; i < n; ++i) 3994 // b[i] = a[i] - a[i - 1]; 3995 // 3996 // There is a first-order recurrence on "a". For this loop, the shorthand 3997 // scalar IR looks like: 3998 // 3999 // scalar.ph: 4000 // s_init = a[-1] 4001 // br scalar.body 4002 // 4003 // scalar.body: 4004 // i = phi [0, scalar.ph], [i+1, scalar.body] 4005 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4006 // s2 = a[i] 4007 // b[i] = s2 - s1 4008 // br cond, scalar.body, ... 4009 // 4010 // In this example, s1 is a recurrence because it's value depends on the 4011 // previous iteration. In the first phase of vectorization, we created a 4012 // temporary value for s1. We now complete the vectorization and produce the 4013 // shorthand vector IR shown below (for VF = 4, UF = 1). 4014 // 4015 // vector.ph: 4016 // v_init = vector(..., ..., ..., a[-1]) 4017 // br vector.body 4018 // 4019 // vector.body 4020 // i = phi [0, vector.ph], [i+4, vector.body] 4021 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4022 // v2 = a[i, i+1, i+2, i+3]; 4023 // v3 = vector(v1(3), v2(0, 1, 2)) 4024 // b[i, i+1, i+2, i+3] = v2 - v3 4025 // br cond, vector.body, middle.block 4026 // 4027 // middle.block: 4028 // x = v2(3) 4029 // br scalar.ph 4030 // 4031 // scalar.ph: 4032 // s_init = phi [x, middle.block], [a[-1], otherwise] 4033 // br scalar.body 4034 // 4035 // After execution completes the vector loop, we extract the next value of 4036 // the recurrence (x) to use as the initial value in the scalar loop. 4037 4038 // Get the original loop preheader and single loop latch. 
4039 auto *Preheader = OrigLoop->getLoopPreheader(); 4040 auto *Latch = OrigLoop->getLoopLatch(); 4041 4042 // Get the initial and previous values of the scalar recurrence. 4043 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4044 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4045 4046 // Create a vector from the initial value. 4047 auto *VectorInit = ScalarInit; 4048 if (VF.isVector()) { 4049 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4050 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4051 VectorInit = Builder.CreateInsertElement( 4052 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4053 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4054 } 4055 4056 // We constructed a temporary phi node in the first phase of vectorization. 4057 // This phi node will eventually be deleted. 4058 Builder.SetInsertPoint( 4059 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 4060 4061 // Create a phi node for the new recurrence. The current value will either be 4062 // the initial value inserted into a vector or loop-varying vector value. 4063 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4064 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4065 4066 // Get the vectorized previous value of the last part UF - 1. It appears last 4067 // among all unrolled iterations, due to the order of their construction. 4068 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 4069 4070 // Find and set the insertion point after the previous value if it is an 4071 // instruction. 4072 BasicBlock::iterator InsertPt; 4073 // Note that the previous value may have been constant-folded so it is not 4074 // guaranteed to be an instruction in the vector loop. 4075 // FIXME: Loop invariant values do not form recurrences. We should deal with 4076 // them earlier. 4077 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4078 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4079 else { 4080 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4081 if (isa<PHINode>(PreviousLastPart)) 4082 // If the previous value is a phi node, we should insert after all the phi 4083 // nodes in the block containing the PHI to avoid breaking basic block 4084 // verification. Note that the basic block may be different to 4085 // LoopVectorBody, in case we predicate the loop. 4086 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4087 else 4088 InsertPt = ++PreviousInst->getIterator(); 4089 } 4090 Builder.SetInsertPoint(&*InsertPt); 4091 4092 // We will construct a vector for the recurrence by combining the values for 4093 // the current and previous iterations. This is the required shuffle mask. 4094 assert(!VF.isScalable()); 4095 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4096 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4097 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4098 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4099 4100 // The vector from which to take the initial value for the current iteration 4101 // (actual or unrolled). Initially, this is the vector phi node. 4102 Value *Incoming = VecPhi; 4103 4104 // Shuffle the current and previous vector and update the vector parts. 4105 for (unsigned Part = 0; Part < UF; ++Part) { 4106 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 4107 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 4108 auto *Shuffle = 4109 VF.isVector() 4110 ? 
Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4111 : Incoming; 4112 PhiPart->replaceAllUsesWith(Shuffle); 4113 cast<Instruction>(PhiPart)->eraseFromParent(); 4114 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 4115 Incoming = PreviousPart; 4116 } 4117 4118 // Fix the latch value of the new recurrence in the vector loop. 4119 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4120 4121 // Extract the last vector element in the middle block. This will be the 4122 // initial value for the recurrence when jumping to the scalar loop. 4123 auto *ExtractForScalar = Incoming; 4124 if (VF.isVector()) { 4125 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4126 ExtractForScalar = Builder.CreateExtractElement( 4127 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4128 "vector.recur.extract"); 4129 } 4130 // Extract the second last element in the middle block if the 4131 // Phi is used outside the loop. We need to extract the phi itself 4132 // and not the last element (the phi update in the current iteration). This 4133 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4134 // when the scalar loop is not run at all. 4135 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4136 if (VF.isVector()) 4137 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4138 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), 4139 "vector.recur.extract.for.phi"); 4140 // When loop is unrolled without vectorizing, initialize 4141 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 4142 // `Incoming`. This is analogous to the vectorized case above: extracting the 4143 // second last element when VF > 1. 4144 else if (UF > 1) 4145 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 4146 4147 // Fix the initial value of the original recurrence in the scalar loop. 4148 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4149 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4150 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4151 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4152 Start->addIncoming(Incoming, BB); 4153 } 4154 4155 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4156 Phi->setName("scalar.recur"); 4157 4158 // Finally, fix users of the recurrence outside the loop. The users will need 4159 // either the last value of the scalar recurrence or the last value of the 4160 // vector recurrence we extracted in the middle block. Since the loop is in 4161 // LCSSA form, we just need to find all the phi nodes for the original scalar 4162 // recurrence in the exit block, and then add an edge for the middle block. 4163 // Note that LCSSA does not imply single entry when the original scalar loop 4164 // had multiple exiting edges (as we always run the last iteration in the 4165 // scalar epilogue); in that case, the exiting path through middle will be 4166 // dynamically dead and the value picked for the phi doesn't matter. 4167 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4168 if (any_of(LCSSAPhi.incoming_values(), 4169 [Phi](Value *V) { return V == Phi; })) 4170 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4171 } 4172 4173 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 4174 // Get it's reduction variable descriptor. 
4175 assert(Legal->isReductionVariable(Phi) && 4176 "Unable to find the reduction variable"); 4177 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4178 4179 RecurKind RK = RdxDesc.getRecurrenceKind(); 4180 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4181 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4182 setDebugLocFromInst(Builder, ReductionStartValue); 4183 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 4184 4185 // This is the vector-clone of the value that leaves the loop. 4186 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 4187 4188 // Wrap flags are in general invalid after vectorization, clear them. 4189 clearReductionWrapFlags(RdxDesc); 4190 4191 // Fix the vector-loop phi. 4192 4193 // Reductions do not have to start at zero. They can start with 4194 // any loop invariant values. 4195 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4196 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4197 4198 for (unsigned Part = 0; Part < UF; ++Part) { 4199 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 4200 Value *Val = getOrCreateVectorValue(LoopVal, Part); 4201 cast<PHINode>(VecRdxPhi) 4202 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4203 } 4204 4205 // Before each round, move the insertion point right between 4206 // the PHIs and the values we are going to write. 4207 // This allows us to write both PHINodes and the extractelement 4208 // instructions. 4209 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4210 4211 setDebugLocFromInst(Builder, LoopExitInst); 4212 4213 // If tail is folded by masking, the vector value to leave the loop should be 4214 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4215 // instead of the former. For an inloop reduction the reduction will already 4216 // be predicated, and does not need to be handled here. 4217 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4218 for (unsigned Part = 0; Part < UF; ++Part) { 4219 Value *VecLoopExitInst = 4220 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4221 Value *Sel = nullptr; 4222 for (User *U : VecLoopExitInst->users()) { 4223 if (isa<SelectInst>(U)) { 4224 assert(!Sel && "Reduction exit feeding two selects"); 4225 Sel = U; 4226 } else 4227 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4228 } 4229 assert(Sel && "Reduction exit feeds no select"); 4230 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4231 4232 // If the target can create a predicated operator for the reduction at no 4233 // extra cost in the loop (for example a predicated vadd), it can be 4234 // cheaper for the select to remain in the loop than be sunk out of it, 4235 // and so use the select value for the phi instead of the old 4236 // LoopExitValue. 4237 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4238 if (PreferPredicatedReductionSelect || 4239 TTI->preferPredicatedReductionSelect( 4240 RdxDesc.getOpcode(), Phi->getType(), 4241 TargetTransformInfo::ReductionFlags())) { 4242 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4243 VecRdxPhi->setIncomingValueForBlock( 4244 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4245 } 4246 } 4247 } 4248 4249 // If the vector reduction can be performed in a smaller type, we truncate 4250 // then extend the loop exit value to enable InstCombine to evaluate the 4251 // entire expression in the smaller type. 
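  // For example (an illustrative sketch only; the types and names are
  // assumptions, not taken from a specific test): if the minimal-bitwidth
  // analysis decided this reduction fits in i8, a chain such as
  //   %sum.next = add i32 %sum, %val   ; %val known to use only 8 bits
  // has its widened loop-exit value truncated to <VF x i8> (with extends
  // re-inserted for any other users), and after the final reduction below the
  // scalar result is sign/zero-extended back to i32.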
4252 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4253 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4254 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4255 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4256 Builder.SetInsertPoint( 4257 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4258 VectorParts RdxParts(UF); 4259 for (unsigned Part = 0; Part < UF; ++Part) { 4260 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4261 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4262 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4263 : Builder.CreateZExt(Trunc, VecTy); 4264 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4265 UI != RdxParts[Part]->user_end();) 4266 if (*UI != Trunc) { 4267 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4268 RdxParts[Part] = Extnd; 4269 } else { 4270 ++UI; 4271 } 4272 } 4273 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4274 for (unsigned Part = 0; Part < UF; ++Part) { 4275 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4276 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4277 } 4278 } 4279 4280 // Reduce all of the unrolled parts into a single vector. 4281 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4282 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4283 4284 // The middle block terminator has already been assigned a DebugLoc here (the 4285 // OrigLoop's single latch terminator). We want the whole middle block to 4286 // appear to execute on this line because: (a) it is all compiler generated, 4287 // (b) these instructions are always executed after evaluating the latch 4288 // conditional branch, and (c) other passes may add new predecessors which 4289 // terminate on this line. This is the easiest way to ensure we don't 4290 // accidentally cause an extra step back into the loop while debugging. 4291 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4292 for (unsigned Part = 1; Part < UF; ++Part) { 4293 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4294 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4295 // Floating point operations had to be 'fast' to enable the reduction. 4296 ReducedPartRdx = addFastMathFlag( 4297 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4298 ReducedPartRdx, "bin.rdx"), 4299 RdxDesc.getFastMathFlags()); 4300 else 4301 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4302 } 4303 4304 // Create the reduction after the loop. Note that inloop reductions create the 4305 // target reduction in the loop using a Reduction recipe. 4306 if (VF.isVector() && !IsInLoopReductionPhi) { 4307 ReducedPartRdx = 4308 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4309 // If the reduction can be performed in a smaller type, we need to extend 4310 // the reduction to the wider type before we branch to the original loop. 4311 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4312 ReducedPartRdx = 4313 RdxDesc.isSigned() 4314 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4315 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4316 } 4317 4318 // Create a phi node that merges control-flow from the backedge-taken check 4319 // block and the middle block. 
4320 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4321 LoopScalarPreHeader->getTerminator()); 4322 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4323 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4324 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4325 4326 // Now, we need to fix the users of the reduction variable 4327 // inside and outside of the scalar remainder loop. 4328 4329 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4330 // in the exit blocks. See comment on analogous loop in 4331 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4332 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4333 if (any_of(LCSSAPhi.incoming_values(), 4334 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4335 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4336 4337 // Fix the scalar loop reduction variable with the incoming reduction sum 4338 // from the vector body and from the backedge value. 4339 int IncomingEdgeBlockIdx = 4340 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4341 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4342 // Pick the other block. 4343 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4344 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4345 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4346 } 4347 4348 void InnerLoopVectorizer::clearReductionWrapFlags( 4349 RecurrenceDescriptor &RdxDesc) { 4350 RecurKind RK = RdxDesc.getRecurrenceKind(); 4351 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4352 return; 4353 4354 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4355 assert(LoopExitInstr && "null loop exit instruction"); 4356 SmallVector<Instruction *, 8> Worklist; 4357 SmallPtrSet<Instruction *, 8> Visited; 4358 Worklist.push_back(LoopExitInstr); 4359 Visited.insert(LoopExitInstr); 4360 4361 while (!Worklist.empty()) { 4362 Instruction *Cur = Worklist.pop_back_val(); 4363 if (isa<OverflowingBinaryOperator>(Cur)) 4364 for (unsigned Part = 0; Part < UF; ++Part) { 4365 Value *V = getOrCreateVectorValue(Cur, Part); 4366 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4367 } 4368 4369 for (User *U : Cur->users()) { 4370 Instruction *UI = cast<Instruction>(U); 4371 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4372 Visited.insert(UI).second) 4373 Worklist.push_back(UI); 4374 } 4375 } 4376 } 4377 4378 void InnerLoopVectorizer::fixLCSSAPHIs() { 4379 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4380 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4381 // Some phis were already hand updated by the reduction and recurrence 4382 // code above, leave them alone. 4383 continue; 4384 4385 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4386 // Non-instruction incoming values will have only one value. 4387 unsigned LastLane = 0; 4388 if (isa<Instruction>(IncomingValue)) 4389 LastLane = Cost->isUniformAfterVectorization( 4390 cast<Instruction>(IncomingValue), VF) 4391 ? 0 4392 : VF.getKnownMinValue() - 1; 4393 assert((!VF.isScalable() || LastLane == 0) && 4394 "scalable vectors dont support non-uniform scalars yet"); 4395 // Can be a loop invariant incoming value or the last scalar value to be 4396 // extracted from the vectorized loop. 
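    // Illustrative sketch (hypothetical loop, not derived from the code
    // below): for a loop computing 'last = a[i]', the exit block contains
    //   %last.lcssa = phi i32 [ %last, %for.body ]
    // and the incoming value added for the middle block is either the
    // loop-invariant value itself or the scalar for lane VF-1 (lane 0 if
    // uniform) of the last unrolled part.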
4397 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4398 Value *lastIncomingValue = 4399 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4400 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4401 } 4402 } 4403 4404 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4405 // The basic block and loop containing the predicated instruction. 4406 auto *PredBB = PredInst->getParent(); 4407 auto *VectorLoop = LI->getLoopFor(PredBB); 4408 4409 // Initialize a worklist with the operands of the predicated instruction. 4410 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4411 4412 // Holds instructions that we need to analyze again. An instruction may be 4413 // reanalyzed if we don't yet know if we can sink it or not. 4414 SmallVector<Instruction *, 8> InstsToReanalyze; 4415 4416 // Returns true if a given use occurs in the predicated block. Phi nodes use 4417 // their operands in their corresponding predecessor blocks. 4418 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4419 auto *I = cast<Instruction>(U.getUser()); 4420 BasicBlock *BB = I->getParent(); 4421 if (auto *Phi = dyn_cast<PHINode>(I)) 4422 BB = Phi->getIncomingBlock( 4423 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4424 return BB == PredBB; 4425 }; 4426 4427 // Iteratively sink the scalarized operands of the predicated instruction 4428 // into the block we created for it. When an instruction is sunk, it's 4429 // operands are then added to the worklist. The algorithm ends after one pass 4430 // through the worklist doesn't sink a single instruction. 4431 bool Changed; 4432 do { 4433 // Add the instructions that need to be reanalyzed to the worklist, and 4434 // reset the changed indicator. 4435 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4436 InstsToReanalyze.clear(); 4437 Changed = false; 4438 4439 while (!Worklist.empty()) { 4440 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4441 4442 // We can't sink an instruction if it is a phi node, is already in the 4443 // predicated block, is not in the loop, or may have side effects. 4444 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4445 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4446 continue; 4447 4448 // It's legal to sink the instruction if all its uses occur in the 4449 // predicated block. Otherwise, there's nothing to do yet, and we may 4450 // need to reanalyze the instruction. 4451 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4452 InstsToReanalyze.push_back(I); 4453 continue; 4454 } 4455 4456 // Move the instruction to the beginning of the predicated block, and add 4457 // it's operands to the worklist. 4458 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4459 Worklist.insert(I->op_begin(), I->op_end()); 4460 4461 // The sinking may have enabled other instructions to be sunk, so we will 4462 // need to iterate. 
4463 Changed = true; 4464 } 4465 } while (Changed); 4466 } 4467 4468 void InnerLoopVectorizer::fixNonInductionPHIs() { 4469 for (PHINode *OrigPhi : OrigPHIsToFix) { 4470 PHINode *NewPhi = 4471 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4472 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4473 4474 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4475 predecessors(OrigPhi->getParent())); 4476 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4477 predecessors(NewPhi->getParent())); 4478 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4479 "Scalar and Vector BB should have the same number of predecessors"); 4480 4481 // The insertion point in Builder may be invalidated by the time we get 4482 // here. Force the Builder insertion point to something valid so that we do 4483 // not run into issues during insertion point restore in 4484 // getOrCreateVectorValue calls below. 4485 Builder.SetInsertPoint(NewPhi); 4486 4487 // The predecessor order is preserved and we can rely on mapping between 4488 // scalar and vector block predecessors. 4489 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4490 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4491 4492 // When looking up the new scalar/vector values to fix up, use incoming 4493 // values from original phi. 4494 Value *ScIncV = 4495 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4496 4497 // Scalar incoming value may need a broadcast 4498 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4499 NewPhi->addIncoming(NewIncV, NewPredBB); 4500 } 4501 } 4502 } 4503 4504 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4505 VPUser &Operands, unsigned UF, 4506 ElementCount VF, bool IsPtrLoopInvariant, 4507 SmallBitVector &IsIndexLoopInvariant, 4508 VPTransformState &State) { 4509 // Construct a vector GEP by widening the operands of the scalar GEP as 4510 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4511 // results in a vector of pointers when at least one operand of the GEP 4512 // is vector-typed. Thus, to keep the representation compact, we only use 4513 // vector-typed operands for loop-varying values. 4514 4515 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4516 // If we are vectorizing, but the GEP has only loop-invariant operands, 4517 // the GEP we build (by only using vector-typed operands for 4518 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4519 // produce a vector of pointers, we need to either arbitrarily pick an 4520 // operand to broadcast, or broadcast a clone of the original GEP. 4521 // Here, we broadcast a clone of the original. 4522 // 4523 // TODO: If at some point we decide to scalarize instructions having 4524 // loop-invariant operands, this special case will no longer be 4525 // required. We would add the scalarization decision to 4526 // collectLoopScalars() and teach getVectorValue() to broadcast 4527 // the lane-zero scalar value. 4528 auto *Clone = Builder.Insert(GEP->clone()); 4529 for (unsigned Part = 0; Part < UF; ++Part) { 4530 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4531 State.set(VPDef, GEP, EntryPart, Part); 4532 addMetadata(EntryPart, GEP); 4533 } 4534 } else { 4535 // If the GEP has at least one loop-varying operand, we are sure to 4536 // produce a vector of pointers. But if we are only unrolling, we want 4537 // to produce a scalar GEP for each unroll part. 
Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
                                     : State.get(Operands.getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
        VPValue *Operand = Operands.getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, {0, 0}));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP =
          GEP->isInBounds()
              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
                                          Indices)
              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
      assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(VPDef, GEP, NewGEP, Part);
      addMetadata(NewGEP, GEP);
    }
  }
}

void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
                                              RecurrenceDescriptor *RdxDesc,
                                              Value *StartV, unsigned UF,
                                              ElementCount VF) {
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  PHINode *P = cast<PHINode>(PN);
  if (EnableVPlanNativePath) {
    // Currently we enter here in the VPlan-native path for non-induction
    // PHIs where all control flow is uniform. We simply widen these PHIs.
    // Create a vector phi with no operands - the vector phi operands will be
    // set at the end of vector code generation.
    Type *VecTy =
        (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
    OrigPHIsToFix.push_back(P);

    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
    Value *Iden = nullptr;
    bool ScalarPHI =
        (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
    Type *VecTy =
        ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);

    if (RdxDesc) {
      assert(Legal->isReductionVariable(P) && StartV &&
             "RdxDesc should only be set for reduction variables; in that case "
             "a StartV is also required");
      RecurKind RK = RdxDesc->getRecurrenceKind();
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
        // MinMax reductions have the start value as their identity.
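        // For example (a sketch with illustrative values): for an smax
        // reduction with start value %s and VF = 4, the widened phi is seeded
        // with the splat <%s, %s, %s, %s>; starting every lane at %s leaves
        // the final smax unchanged, whereas e.g. an add reduction seeds lanes
        // other than lane 0 with the neutral constant 0.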
4615 if (ScalarPHI) { 4616 Iden = StartV; 4617 } else { 4618 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4619 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4620 StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident"); 4621 } 4622 } else { 4623 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( 4624 RK, VecTy->getScalarType()); 4625 Iden = IdenC; 4626 4627 if (!ScalarPHI) { 4628 Iden = ConstantVector::getSplat(VF, IdenC); 4629 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4630 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4631 Constant *Zero = Builder.getInt32(0); 4632 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 4633 } 4634 } 4635 } 4636 4637 for (unsigned Part = 0; Part < UF; ++Part) { 4638 // This is phase one of vectorizing PHIs. 4639 Value *EntryPart = PHINode::Create( 4640 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4641 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4642 if (StartV) { 4643 // Make sure to add the reduction start value only to the 4644 // first unroll part. 4645 Value *StartVal = (Part == 0) ? StartV : Iden; 4646 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); 4647 } 4648 } 4649 return; 4650 } 4651 4652 assert(!Legal->isReductionVariable(P) && 4653 "reductions should be handled above"); 4654 4655 setDebugLocFromInst(Builder, P); 4656 4657 // This PHINode must be an induction variable. 4658 // Make sure that we know about it. 4659 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4660 4661 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4662 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4663 4664 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4665 // which can be found from the original scalar operations. 4666 switch (II.getKind()) { 4667 case InductionDescriptor::IK_NoInduction: 4668 llvm_unreachable("Unknown induction"); 4669 case InductionDescriptor::IK_IntInduction: 4670 case InductionDescriptor::IK_FpInduction: 4671 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4672 case InductionDescriptor::IK_PtrInduction: { 4673 // Handle the pointer induction variable case. 4674 assert(P->getType()->isPointerTy() && "Unexpected type."); 4675 4676 if (Cost->isScalarAfterVectorization(P, VF)) { 4677 // This is the normalized GEP that starts counting at zero. 4678 Value *PtrInd = 4679 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4680 // Determine the number of scalars we need to generate for each unroll 4681 // iteration. If the instruction is uniform, we only need to generate the 4682 // first lane. Otherwise, we generate all VF values. 4683 unsigned Lanes = 4684 Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.getKnownMinValue(); 4685 for (unsigned Part = 0; Part < UF; ++Part) { 4686 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4687 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4688 Lane + Part * VF.getKnownMinValue()); 4689 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4690 Value *SclrGep = 4691 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4692 SclrGep->setName("next.gep"); 4693 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4694 } 4695 } 4696 return; 4697 } 4698 assert(isa<SCEVConstant>(II.getStep()) && 4699 "Induction step not a SCEV constant!"); 4700 Type *PhiType = II.getStep()->getType(); 4701 4702 // Build a pointer phi 4703 Value *ScalarStartValue = II.getStartValue(); 4704 Type *ScStValueType = ScalarStartValue->getType(); 4705 PHINode *NewPointerPhi = 4706 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4707 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4708 4709 // A pointer induction, performed by using a gep 4710 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4711 Instruction *InductionLoc = LoopLatch->getTerminator(); 4712 const SCEV *ScalarStep = II.getStep(); 4713 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4714 Value *ScalarStepValue = 4715 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4716 Value *InductionGEP = GetElementPtrInst::Create( 4717 ScStValueType->getPointerElementType(), NewPointerPhi, 4718 Builder.CreateMul( 4719 ScalarStepValue, 4720 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4721 "ptr.ind", InductionLoc); 4722 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4723 4724 // Create UF many actual address geps that use the pointer 4725 // phi as base and a vectorized version of the step value 4726 // (<step*0, ..., step*N>) as offset. 4727 for (unsigned Part = 0; Part < UF; ++Part) { 4728 SmallVector<Constant *, 8> Indices; 4729 // Create a vector of consecutive numbers from zero to VF. 4730 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4731 Indices.push_back( 4732 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4733 Constant *StartOffset = ConstantVector::get(Indices); 4734 4735 Value *GEP = Builder.CreateGEP( 4736 ScStValueType->getPointerElementType(), NewPointerPhi, 4737 Builder.CreateMul( 4738 StartOffset, 4739 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4740 "vector.gep")); 4741 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4742 } 4743 } 4744 } 4745 } 4746 4747 /// A helper function for checking whether an integer division-related 4748 /// instruction may divide by zero (in which case it must be predicated if 4749 /// executed conditionally in the scalar code). 4750 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4751 /// Non-zero divisors that are non compile-time constants will not be 4752 /// converted into multiplication, so we will still end up scalarizing 4753 /// the division, but can do so w/o predication. 
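/// For example (illustrative only): in 'if (c[i]) r[i] = a[i] / b[i];' the
/// divisor is not a compile-time constant, so a scalarized division must stay
/// behind the predicate, whereas 'a[i] / 7' may be scalarized without
/// predication because the divisor is a known non-zero constant.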
4754 static bool mayDivideByZero(Instruction &I) { 4755 assert((I.getOpcode() == Instruction::UDiv || 4756 I.getOpcode() == Instruction::SDiv || 4757 I.getOpcode() == Instruction::URem || 4758 I.getOpcode() == Instruction::SRem) && 4759 "Unexpected instruction"); 4760 Value *Divisor = I.getOperand(1); 4761 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4762 return !CInt || CInt->isZero(); 4763 } 4764 4765 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4766 VPUser &User, 4767 VPTransformState &State) { 4768 switch (I.getOpcode()) { 4769 case Instruction::Call: 4770 case Instruction::Br: 4771 case Instruction::PHI: 4772 case Instruction::GetElementPtr: 4773 case Instruction::Select: 4774 llvm_unreachable("This instruction is handled by a different recipe."); 4775 case Instruction::UDiv: 4776 case Instruction::SDiv: 4777 case Instruction::SRem: 4778 case Instruction::URem: 4779 case Instruction::Add: 4780 case Instruction::FAdd: 4781 case Instruction::Sub: 4782 case Instruction::FSub: 4783 case Instruction::FNeg: 4784 case Instruction::Mul: 4785 case Instruction::FMul: 4786 case Instruction::FDiv: 4787 case Instruction::FRem: 4788 case Instruction::Shl: 4789 case Instruction::LShr: 4790 case Instruction::AShr: 4791 case Instruction::And: 4792 case Instruction::Or: 4793 case Instruction::Xor: { 4794 // Just widen unops and binops. 4795 setDebugLocFromInst(Builder, &I); 4796 4797 for (unsigned Part = 0; Part < UF; ++Part) { 4798 SmallVector<Value *, 2> Ops; 4799 for (VPValue *VPOp : User.operands()) 4800 Ops.push_back(State.get(VPOp, Part)); 4801 4802 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4803 4804 if (auto *VecOp = dyn_cast<Instruction>(V)) 4805 VecOp->copyIRFlags(&I); 4806 4807 // Use this vector value for all users of the original instruction. 4808 State.set(Def, &I, V, Part); 4809 addMetadata(V, &I); 4810 } 4811 4812 break; 4813 } 4814 case Instruction::ICmp: 4815 case Instruction::FCmp: { 4816 // Widen compares. Generate vector compares. 4817 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4818 auto *Cmp = cast<CmpInst>(&I); 4819 setDebugLocFromInst(Builder, Cmp); 4820 for (unsigned Part = 0; Part < UF; ++Part) { 4821 Value *A = State.get(User.getOperand(0), Part); 4822 Value *B = State.get(User.getOperand(1), Part); 4823 Value *C = nullptr; 4824 if (FCmp) { 4825 // Propagate fast math flags. 4826 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4827 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4828 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4829 } else { 4830 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4831 } 4832 State.set(Def, &I, C, Part); 4833 addMetadata(C, &I); 4834 } 4835 4836 break; 4837 } 4838 4839 case Instruction::ZExt: 4840 case Instruction::SExt: 4841 case Instruction::FPToUI: 4842 case Instruction::FPToSI: 4843 case Instruction::FPExt: 4844 case Instruction::PtrToInt: 4845 case Instruction::IntToPtr: 4846 case Instruction::SIToFP: 4847 case Instruction::UIToFP: 4848 case Instruction::Trunc: 4849 case Instruction::FPTrunc: 4850 case Instruction::BitCast: { 4851 auto *CI = cast<CastInst>(&I); 4852 setDebugLocFromInst(Builder, CI); 4853 4854 /// Vectorize casts. 4855 Type *DestTy = 4856 (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4857 4858 for (unsigned Part = 0; Part < UF; ++Part) { 4859 Value *A = State.get(User.getOperand(0), Part); 4860 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4861 State.set(Def, &I, Cast, Part); 4862 addMetadata(Cast, &I); 4863 } 4864 break; 4865 } 4866 default: 4867 // This instruction is not vectorized by simple widening. 4868 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4869 llvm_unreachable("Unhandled instruction!"); 4870 } // end of switch. 4871 } 4872 4873 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4874 VPUser &ArgOperands, 4875 VPTransformState &State) { 4876 assert(!isa<DbgInfoIntrinsic>(I) && 4877 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4878 setDebugLocFromInst(Builder, &I); 4879 4880 Module *M = I.getParent()->getParent()->getParent(); 4881 auto *CI = cast<CallInst>(&I); 4882 4883 SmallVector<Type *, 4> Tys; 4884 for (Value *ArgOperand : CI->arg_operands()) 4885 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4886 4887 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4888 4889 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4890 // version of the instruction. 4891 // Is it beneficial to perform intrinsic call compared to lib call? 4892 bool NeedToScalarize = false; 4893 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4894 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4895 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4896 assert((UseVectorIntrinsic || !NeedToScalarize) && 4897 "Instruction should be scalarized elsewhere."); 4898 assert(IntrinsicCost.isValid() && CallCost.isValid() && 4899 "Cannot have invalid costs while widening"); 4900 4901 for (unsigned Part = 0; Part < UF; ++Part) { 4902 SmallVector<Value *, 4> Args; 4903 for (auto &I : enumerate(ArgOperands.operands())) { 4904 // Some intrinsics have a scalar argument - don't replace it with a 4905 // vector. 4906 Value *Arg; 4907 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4908 Arg = State.get(I.value(), Part); 4909 else 4910 Arg = State.get(I.value(), {0, 0}); 4911 Args.push_back(Arg); 4912 } 4913 4914 Function *VectorF; 4915 if (UseVectorIntrinsic) { 4916 // Use vector version of the intrinsic. 4917 Type *TysForDecl[] = {CI->getType()}; 4918 if (VF.isVector()) { 4919 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4920 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4921 } 4922 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4923 assert(VectorF && "Can't retrieve vector intrinsic."); 4924 } else { 4925 // Use vector version of the function call. 
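      // Illustrative sketch (the scalar function and its vector variant are
      // hypothetical): a call such as 'call float @expf(float %x)' for which
      // the VFDatabase has a matching mapping is replaced by a call to the
      // registered '<4 x float>' variant for this VF, passing the operands
      // widened above.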
4926 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4927 #ifndef NDEBUG 4928 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4929 "Can't create vector function."); 4930 #endif 4931 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4932 } 4933 SmallVector<OperandBundleDef, 1> OpBundles; 4934 CI->getOperandBundlesAsDefs(OpBundles); 4935 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4936 4937 if (isa<FPMathOperator>(V)) 4938 V->copyFastMathFlags(CI); 4939 4940 State.set(Def, &I, V, Part); 4941 addMetadata(V, &I); 4942 } 4943 } 4944 4945 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4946 VPUser &Operands, 4947 bool InvariantCond, 4948 VPTransformState &State) { 4949 setDebugLocFromInst(Builder, &I); 4950 4951 // The condition can be loop invariant but still defined inside the 4952 // loop. This means that we can't just use the original 'cond' value. 4953 // We have to take the 'vectorized' value and pick the first lane. 4954 // Instcombine will make this a no-op. 4955 auto *InvarCond = 4956 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4957 4958 for (unsigned Part = 0; Part < UF; ++Part) { 4959 Value *Cond = 4960 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4961 Value *Op0 = State.get(Operands.getOperand(1), Part); 4962 Value *Op1 = State.get(Operands.getOperand(2), Part); 4963 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4964 State.set(VPDef, &I, Sel, Part); 4965 addMetadata(Sel, &I); 4966 } 4967 } 4968 4969 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4970 // We should not collect Scalars more than once per VF. Right now, this 4971 // function is called from collectUniformsAndScalars(), which already does 4972 // this check. Collecting Scalars for VF=1 does not make any sense. 4973 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4974 "This function should not be visited twice for the same VF"); 4975 4976 SmallSetVector<Instruction *, 8> Worklist; 4977 4978 // These sets are used to seed the analysis with pointers used by memory 4979 // accesses that will remain scalar. 4980 SmallSetVector<Instruction *, 8> ScalarPtrs; 4981 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4982 auto *Latch = TheLoop->getLoopLatch(); 4983 4984 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4985 // The pointer operands of loads and stores will be scalar as long as the 4986 // memory access is not a gather or scatter operation. The value operand of a 4987 // store will remain scalar if the store is scalarized. 4988 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4989 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4990 assert(WideningDecision != CM_Unknown && 4991 "Widening decision should be ready at this moment"); 4992 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4993 if (Ptr == Store->getValueOperand()) 4994 return WideningDecision == CM_Scalarize; 4995 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4996 "Ptr is neither a value or pointer operand"); 4997 return WideningDecision != CM_GatherScatter; 4998 }; 4999 5000 // A helper that returns true if the given value is a bitcast or 5001 // getelementptr instruction contained in the loop. 
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
    if (!isa<PHINode>(Ptr) ||
        !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
      return false;
    auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
    if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
      return false;
    return isScalarUse(MemAccess, Ptr);
  };

  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is actually the pointer induction of a loop, it is inserted into
  // the Worklist. If the use will be a scalar use, and the pointer is only
  // used by memory accesses, we place the pointer in ScalarPtrs. Otherwise,
  // the pointer is placed in PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    if (isScalarPtrInduction(MemAccess, Ptr)) {
      Worklist.insert(cast<Instruction>(Ptr));
      Instruction *Update = cast<Instruction>(
          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
      Worklist.insert(Update);
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
                        << "\n");
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
                        << "\n");
      return;
    }
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
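  // For example (sketch only): in a loop that does 'p = &A[i]; *p = x;' where
  // the store will be scalarized, the getelementptr producing 'p' is used
  // only by that memory access, so it is placed in ScalarPtrs and added to
  // the worklist below; if 'p' also had a non-memory user, it would land in
  // PossibleNonScalarPtrs instead.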
5071 for (auto *BB : TheLoop->blocks()) 5072 for (auto &I : *BB) { 5073 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5074 evaluatePtrUse(Load, Load->getPointerOperand()); 5075 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5076 evaluatePtrUse(Store, Store->getPointerOperand()); 5077 evaluatePtrUse(Store, Store->getValueOperand()); 5078 } 5079 } 5080 for (auto *I : ScalarPtrs) 5081 if (!PossibleNonScalarPtrs.count(I)) { 5082 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5083 Worklist.insert(I); 5084 } 5085 5086 // Insert the forced scalars. 5087 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5088 // induction variable when the PHI user is scalarized. 5089 auto ForcedScalar = ForcedScalars.find(VF); 5090 if (ForcedScalar != ForcedScalars.end()) 5091 for (auto *I : ForcedScalar->second) 5092 Worklist.insert(I); 5093 5094 // Expand the worklist by looking through any bitcasts and getelementptr 5095 // instructions we've already identified as scalar. This is similar to the 5096 // expansion step in collectLoopUniforms(); however, here we're only 5097 // expanding to include additional bitcasts and getelementptr instructions. 5098 unsigned Idx = 0; 5099 while (Idx != Worklist.size()) { 5100 Instruction *Dst = Worklist[Idx++]; 5101 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5102 continue; 5103 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5104 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5105 auto *J = cast<Instruction>(U); 5106 return !TheLoop->contains(J) || Worklist.count(J) || 5107 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5108 isScalarUse(J, Src)); 5109 })) { 5110 Worklist.insert(Src); 5111 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5112 } 5113 } 5114 5115 // An induction variable will remain scalar if all users of the induction 5116 // variable and induction variable update remain scalar. 5117 for (auto &Induction : Legal->getInductionVars()) { 5118 auto *Ind = Induction.first; 5119 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5120 5121 // If tail-folding is applied, the primary induction variable will be used 5122 // to feed a vector compare. 5123 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5124 continue; 5125 5126 // Determine if all users of the induction variable are scalar after 5127 // vectorization. 5128 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5129 auto *I = cast<Instruction>(U); 5130 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5131 }); 5132 if (!ScalarInd) 5133 continue; 5134 5135 // Determine if all users of the induction variable update instruction are 5136 // scalar after vectorization. 5137 auto ScalarIndUpdate = 5138 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5139 auto *I = cast<Instruction>(U); 5140 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5141 }); 5142 if (!ScalarIndUpdate) 5143 continue; 5144 5145 // The induction variable and its update instruction will remain scalar. 
5146 Worklist.insert(Ind); 5147 Worklist.insert(IndUpdate); 5148 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5149 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5150 << "\n"); 5151 } 5152 5153 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5154 } 5155 5156 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5157 ElementCount VF) { 5158 if (!blockNeedsPredication(I->getParent())) 5159 return false; 5160 switch(I->getOpcode()) { 5161 default: 5162 break; 5163 case Instruction::Load: 5164 case Instruction::Store: { 5165 if (!Legal->isMaskRequired(I)) 5166 return false; 5167 auto *Ptr = getLoadStorePointerOperand(I); 5168 auto *Ty = getMemInstValueType(I); 5169 // We have already decided how to vectorize this instruction, get that 5170 // result. 5171 if (VF.isVector()) { 5172 InstWidening WideningDecision = getWideningDecision(I, VF); 5173 assert(WideningDecision != CM_Unknown && 5174 "Widening decision should be ready at this moment"); 5175 return WideningDecision == CM_Scalarize; 5176 } 5177 const Align Alignment = getLoadStoreAlignment(I); 5178 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5179 isLegalMaskedGather(Ty, Alignment)) 5180 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5181 isLegalMaskedScatter(Ty, Alignment)); 5182 } 5183 case Instruction::UDiv: 5184 case Instruction::SDiv: 5185 case Instruction::SRem: 5186 case Instruction::URem: 5187 return mayDivideByZero(*I); 5188 } 5189 return false; 5190 } 5191 5192 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5193 Instruction *I, ElementCount VF) { 5194 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5195 assert(getWideningDecision(I, VF) == CM_Unknown && 5196 "Decision should not be set yet."); 5197 auto *Group = getInterleavedAccessGroup(I); 5198 assert(Group && "Must have a group."); 5199 5200 // If the instruction's allocated size doesn't equal it's type size, it 5201 // requires padding and will be scalarized. 5202 auto &DL = I->getModule()->getDataLayout(); 5203 auto *ScalarTy = getMemInstValueType(I); 5204 if (hasIrregularType(ScalarTy, DL, VF)) 5205 return false; 5206 5207 // Check if masking is required. 5208 // A Group may need masking for one of two reasons: it resides in a block that 5209 // needs predication, or it was decided to use masking to deal with gaps. 5210 bool PredicatedAccessRequiresMasking = 5211 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5212 bool AccessWithGapsRequiresMasking = 5213 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5214 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5215 return true; 5216 5217 // If masked interleaving is required, we expect that the user/target had 5218 // enabled it, because otherwise it either wouldn't have been created or 5219 // it should have been invalidated by the CostModel. 5220 assert(useMaskedInterleavedAccesses(TTI) && 5221 "Masked interleave-groups for predicated accesses are not enabled."); 5222 5223 auto *Ty = getMemInstValueType(I); 5224 const Align Alignment = getLoadStoreAlignment(I); 5225 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5226 : TTI.isLegalMaskedStore(Ty, Alignment); 5227 } 5228 5229 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5230 Instruction *I, ElementCount VF) { 5231 // Get and ensure we have a valid memory instruction. 
5232 LoadInst *LI = dyn_cast<LoadInst>(I); 5233 StoreInst *SI = dyn_cast<StoreInst>(I); 5234 assert((LI || SI) && "Invalid memory instruction"); 5235 5236 auto *Ptr = getLoadStorePointerOperand(I); 5237 5238 // In order to be widened, the pointer should be consecutive, first of all. 5239 if (!Legal->isConsecutivePtr(Ptr)) 5240 return false; 5241 5242 // If the instruction is a store located in a predicated block, it will be 5243 // scalarized. 5244 if (isScalarWithPredication(I)) 5245 return false; 5246 5247 // If the instruction's allocated size doesn't equal it's type size, it 5248 // requires padding and will be scalarized. 5249 auto &DL = I->getModule()->getDataLayout(); 5250 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5251 if (hasIrregularType(ScalarTy, DL, VF)) 5252 return false; 5253 5254 return true; 5255 } 5256 5257 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5258 // We should not collect Uniforms more than once per VF. Right now, 5259 // this function is called from collectUniformsAndScalars(), which 5260 // already does this check. Collecting Uniforms for VF=1 does not make any 5261 // sense. 5262 5263 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5264 "This function should not be visited twice for the same VF"); 5265 5266 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5267 // not analyze again. Uniforms.count(VF) will return 1. 5268 Uniforms[VF].clear(); 5269 5270 // We now know that the loop is vectorizable! 5271 // Collect instructions inside the loop that will remain uniform after 5272 // vectorization. 5273 5274 // Global values, params and instructions outside of current loop are out of 5275 // scope. 5276 auto isOutOfScope = [&](Value *V) -> bool { 5277 Instruction *I = dyn_cast<Instruction>(V); 5278 return (!I || !TheLoop->contains(I)); 5279 }; 5280 5281 SetVector<Instruction *> Worklist; 5282 BasicBlock *Latch = TheLoop->getLoopLatch(); 5283 5284 // Instructions that are scalar with predication must not be considered 5285 // uniform after vectorization, because that would create an erroneous 5286 // replicating region where only a single instance out of VF should be formed. 5287 // TODO: optimize such seldom cases if found important, see PR40816. 5288 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5289 if (isOutOfScope(I)) { 5290 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5291 << *I << "\n"); 5292 return; 5293 } 5294 if (isScalarWithPredication(I, VF)) { 5295 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5296 << *I << "\n"); 5297 return; 5298 } 5299 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5300 Worklist.insert(I); 5301 }; 5302 5303 // Start with the conditional branch. If the branch condition is an 5304 // instruction contained in the loop that is only used by the branch, it is 5305 // uniform. 5306 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5307 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5308 addToWorklistIfAllowed(Cmp); 5309 5310 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5311 InstWidening WideningDecision = getWideningDecision(I, VF); 5312 assert(WideningDecision != CM_Unknown && 5313 "Widening decision should be ready at this moment"); 5314 5315 // A uniform memory op is itself uniform. We exclude uniform stores 5316 // here as they demand the last lane, not the first one. 
5317 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5318 assert(WideningDecision == CM_Scalarize); 5319 return true; 5320 } 5321 5322 return (WideningDecision == CM_Widen || 5323 WideningDecision == CM_Widen_Reverse || 5324 WideningDecision == CM_Interleave); 5325 }; 5326 5327 5328 // Returns true if Ptr is the pointer operand of a memory access instruction 5329 // I, and I is known to not require scalarization. 5330 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5331 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5332 }; 5333 5334 // Holds a list of values which are known to have at least one uniform use. 5335 // Note that there may be other uses which aren't uniform. A "uniform use" 5336 // here is something which only demands lane 0 of the unrolled iterations; 5337 // it does not imply that all lanes produce the same value (e.g. this is not 5338 // the usual meaning of uniform) 5339 SmallPtrSet<Value *, 8> HasUniformUse; 5340 5341 // Scan the loop for instructions which are either a) known to have only 5342 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5343 for (auto *BB : TheLoop->blocks()) 5344 for (auto &I : *BB) { 5345 // If there's no pointer operand, there's nothing to do. 5346 auto *Ptr = getLoadStorePointerOperand(&I); 5347 if (!Ptr) 5348 continue; 5349 5350 // A uniform memory op is itself uniform. We exclude uniform stores 5351 // here as they demand the last lane, not the first one. 5352 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5353 addToWorklistIfAllowed(&I); 5354 5355 if (isUniformDecision(&I, VF)) { 5356 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5357 HasUniformUse.insert(Ptr); 5358 } 5359 } 5360 5361 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5362 // demanding) users. Since loops are assumed to be in LCSSA form, this 5363 // disallows uses outside the loop as well. 5364 for (auto *V : HasUniformUse) { 5365 if (isOutOfScope(V)) 5366 continue; 5367 auto *I = cast<Instruction>(V); 5368 auto UsersAreMemAccesses = 5369 llvm::all_of(I->users(), [&](User *U) -> bool { 5370 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5371 }); 5372 if (UsersAreMemAccesses) 5373 addToWorklistIfAllowed(I); 5374 } 5375 5376 // Expand Worklist in topological order: whenever a new instruction 5377 // is added , its users should be already inside Worklist. It ensures 5378 // a uniform instruction will only be used by uniform instructions. 5379 unsigned idx = 0; 5380 while (idx != Worklist.size()) { 5381 Instruction *I = Worklist[idx++]; 5382 5383 for (auto OV : I->operand_values()) { 5384 // isOutOfScope operands cannot be uniform instructions. 5385 if (isOutOfScope(OV)) 5386 continue; 5387 // First order recurrence Phi's should typically be considered 5388 // non-uniform. 5389 auto *OP = dyn_cast<PHINode>(OV); 5390 if (OP && Legal->isFirstOrderRecurrence(OP)) 5391 continue; 5392 // If all the users of the operand are uniform, then add the 5393 // operand into the uniform worklist. 5394 auto *OI = cast<Instruction>(OV); 5395 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5396 auto *J = cast<Instruction>(U); 5397 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5398 })) 5399 addToWorklistIfAllowed(OI); 5400 } 5401 } 5402 5403 // For an instruction to be added into Worklist above, all its users inside 5404 // the loop should also be in Worklist. 
However, this condition cannot be 5405 // true for phi nodes that form a cyclic dependence. We must process phi 5406 // nodes separately. An induction variable will remain uniform if all users 5407 // of the induction variable and induction variable update remain uniform. 5408 // The code below handles both pointer and non-pointer induction variables. 5409 for (auto &Induction : Legal->getInductionVars()) { 5410 auto *Ind = Induction.first; 5411 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5412 5413 // Determine if all users of the induction variable are uniform after 5414 // vectorization. 5415 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5416 auto *I = cast<Instruction>(U); 5417 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5418 isVectorizedMemAccessUse(I, Ind); 5419 }); 5420 if (!UniformInd) 5421 continue; 5422 5423 // Determine if all users of the induction variable update instruction are 5424 // uniform after vectorization. 5425 auto UniformIndUpdate = 5426 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5427 auto *I = cast<Instruction>(U); 5428 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5429 isVectorizedMemAccessUse(I, IndUpdate); 5430 }); 5431 if (!UniformIndUpdate) 5432 continue; 5433 5434 // The induction variable and its update instruction will remain uniform. 5435 addToWorklistIfAllowed(Ind); 5436 addToWorklistIfAllowed(IndUpdate); 5437 } 5438 5439 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5440 } 5441 5442 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5443 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5444 5445 if (Legal->getRuntimePointerChecking()->Need) { 5446 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5447 "runtime pointer checks needed. Enable vectorization of this " 5448 "loop with '#pragma clang loop vectorize(enable)' when " 5449 "compiling with -Os/-Oz", 5450 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5451 return true; 5452 } 5453 5454 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5455 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5456 "runtime SCEV checks needed. Enable vectorization of this " 5457 "loop with '#pragma clang loop vectorize(enable)' when " 5458 "compiling with -Os/-Oz", 5459 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5460 return true; 5461 } 5462 5463 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5464 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5465 reportVectorizationFailure("Runtime stride check for small trip count", 5466 "runtime stride == 1 checks needed. Enable vectorization of " 5467 "this loop without such check by compiling with -Os/-Oz", 5468 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5469 return true; 5470 } 5471 5472 return false; 5473 } 5474 5475 Optional<ElementCount> 5476 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5477 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5478 // TODO: It may by useful to do since it's still likely to be dynamically 5479 // uniform if the target can skip. 5480 reportVectorizationFailure( 5481 "Not inserting runtime ptr check for divergent target", 5482 "runtime pointer checks needed. 
Not enabled for divergent target", 5483 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5484 return None; 5485 } 5486 5487 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5488 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5489 if (TC == 1) { 5490 reportVectorizationFailure("Single iteration (non) loop", 5491 "loop trip count is one, irrelevant for vectorization", 5492 "SingleIterationLoop", ORE, TheLoop); 5493 return None; 5494 } 5495 5496 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); 5497 5498 switch (ScalarEpilogueStatus) { 5499 case CM_ScalarEpilogueAllowed: 5500 return MaxVF; 5501 case CM_ScalarEpilogueNotAllowedUsePredicate: 5502 LLVM_FALLTHROUGH; 5503 case CM_ScalarEpilogueNotNeededUsePredicate: 5504 LLVM_DEBUG( 5505 dbgs() << "LV: vector predicate hint/switch found.\n" 5506 << "LV: Not allowing scalar epilogue, creating predicated " 5507 << "vector loop.\n"); 5508 break; 5509 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5510 // fallthrough as a special case of OptForSize 5511 case CM_ScalarEpilogueNotAllowedOptSize: 5512 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5513 LLVM_DEBUG( 5514 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5515 else 5516 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5517 << "count.\n"); 5518 5519 // Bail if runtime checks are required, which are not good when optimising 5520 // for size. 5521 if (runtimeChecksRequired()) 5522 return None; 5523 5524 break; 5525 } 5526 5527 // The only loops we can vectorize without a scalar epilogue, are loops with 5528 // a bottom-test and a single exiting block. We'd have to handle the fact 5529 // that not every instruction executes on the last iteration. This will 5530 // require a lane mask which varies through the vector loop body. (TODO) 5531 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5532 // If there was a tail-folding hint/switch, but we can't fold the tail by 5533 // masking, fallback to a vectorization with a scalar epilogue. 5534 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5535 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5536 "scalar epilogue instead.\n"); 5537 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5538 return MaxVF; 5539 } 5540 return None; 5541 } 5542 5543 // Now try the tail folding 5544 5545 // Invalidate interleave groups that require an epilogue if we can't mask 5546 // the interleave-group. 5547 if (!useMaskedInterleavedAccesses(TTI)) { 5548 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5549 "No decisions should have been taken at this point"); 5550 // Note: There is no need to invalidate any cost modeling decisions here, as 5551 // non where taken so far. 5552 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5553 } 5554 5555 assert(!MaxVF.isScalable() && 5556 "Scalable vectors do not yet support tail folding"); 5557 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && 5558 "MaxVF must be a power of 2"); 5559 unsigned MaxVFtimesIC = 5560 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); 5561 // Avoid tail folding if the trip count is known to be a multiple of any VF we 5562 // chose. 
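  // Worked example of the check below (illustrative numbers only): a
  // backedge-taken count of 1023 gives an exit count of 1024; with MaxVF = 8
  // and UserIC = 2 we test 1024 % (8 * 2) == 0, so no tail remains and
  // neither tail folding nor a scalar epilogue is needed. With an exit count
  // of 1000, 1000 % 16 == 8, and we fall through to the tail-folding
  // decisions further down.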
5563 ScalarEvolution *SE = PSE.getSE(); 5564 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5565 const SCEV *ExitCount = SE->getAddExpr( 5566 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5567 const SCEV *Rem = SE->getURemExpr( 5568 ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5569 if (Rem->isZero()) { 5570 // Accept MaxVF if we do not have a tail. 5571 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5572 return MaxVF; 5573 } 5574 5575 // If we don't know the precise trip count, or if the trip count that we 5576 // found modulo the vectorization factor is not zero, try to fold the tail 5577 // by masking. 5578 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5579 if (Legal->prepareToFoldTailByMasking()) { 5580 FoldTailByMasking = true; 5581 return MaxVF; 5582 } 5583 5584 // If there was a tail-folding hint/switch, but we can't fold the tail by 5585 // masking, fallback to a vectorization with a scalar epilogue. 5586 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5587 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5588 "scalar epilogue instead.\n"); 5589 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5590 return MaxVF; 5591 } 5592 5593 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5594 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5595 return None; 5596 } 5597 5598 if (TC == 0) { 5599 reportVectorizationFailure( 5600 "Unable to calculate the loop count due to complex control flow", 5601 "unable to calculate the loop count due to complex control flow", 5602 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5603 return None; 5604 } 5605 5606 reportVectorizationFailure( 5607 "Cannot optimize for size and vectorize at the same time.", 5608 "cannot optimize for size and vectorize at the same time. " 5609 "Enable vectorization of this loop with '#pragma clang loop " 5610 "vectorize(enable)' when compiling with -Os/-Oz", 5611 "NoTailLoopWithOptForSize", ORE, TheLoop); 5612 return None; 5613 } 5614 5615 ElementCount 5616 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5617 ElementCount UserVF) { 5618 bool IgnoreScalableUserVF = UserVF.isScalable() && 5619 !TTI.supportsScalableVectors() && 5620 !ForceTargetSupportsScalableVectors; 5621 if (IgnoreScalableUserVF) { 5622 LLVM_DEBUG( 5623 dbgs() << "LV: Ignoring VF=" << UserVF 5624 << " because target does not support scalable vectors.\n"); 5625 ORE->emit([&]() { 5626 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", 5627 TheLoop->getStartLoc(), 5628 TheLoop->getHeader()) 5629 << "Ignoring VF=" << ore::NV("UserVF", UserVF) 5630 << " because target does not support scalable vectors."; 5631 }); 5632 } 5633 5634 // Beyond this point two scenarios are handled. If UserVF isn't specified 5635 // then a suitable VF is chosen. If UserVF is specified and there are 5636 // dependencies, check if it's legal. However, if a UserVF is specified and 5637 // there are no dependencies, then there's nothing to do. 
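  // A rough sketch of the cases below (illustrative numbers only):
  //  - No UserVF: pick MaxVF from the target, e.g. 256-bit registers and a
  //    widest type of i32 give a starting MaxVF of 8.
  //  - UserVF given and no unsafe dependences: return UserVF unchanged.
  //  - UserVF = 8 given, but LAA reports a max safe dependence width of 128
  //    bits with a widest type of i32: UserVF is clamped to the max safe VF
  //    of 128 / 32 = 4.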
5638 if (UserVF.isNonZero() && !IgnoreScalableUserVF && 5639 Legal->isSafeForAnyVectorWidth()) 5640 return UserVF; 5641 5642 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5643 unsigned SmallestType, WidestType; 5644 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5645 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5646 5647 // Get the maximum safe dependence distance in bits computed by LAA. 5648 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5649 // the memory accesses that is most restrictive (involved in the smallest 5650 // dependence distance). 5651 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5652 5653 // If the user vectorization factor is legally unsafe, clamp it to a safe 5654 // value. Otherwise, return as is. 5655 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5656 unsigned MaxSafeElements = 5657 PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5658 ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); 5659 5660 if (UserVF.isScalable()) { 5661 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5662 5663 // Scale VF by vscale before checking if it's safe. 5664 MaxSafeVF = ElementCount::getScalable( 5665 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5666 5667 if (MaxSafeVF.isZero()) { 5668 // The dependence distance is too small to use scalable vectors, 5669 // fallback on fixed. 5670 LLVM_DEBUG( 5671 dbgs() 5672 << "LV: Max legal vector width too small, scalable vectorization " 5673 "unfeasible. Using fixed-width vectorization instead.\n"); 5674 ORE->emit([&]() { 5675 return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", 5676 TheLoop->getStartLoc(), 5677 TheLoop->getHeader()) 5678 << "Max legal vector width too small, scalable vectorization " 5679 << "unfeasible. Using fixed-width vectorization instead."; 5680 }); 5681 return computeFeasibleMaxVF( 5682 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5683 } 5684 } 5685 5686 LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); 5687 5688 if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) 5689 return UserVF; 5690 5691 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5692 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5693 << ".\n"); 5694 ORE->emit([&]() { 5695 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5696 TheLoop->getStartLoc(), 5697 TheLoop->getHeader()) 5698 << "User-specified vectorization factor " 5699 << ore::NV("UserVectorizationFactor", UserVF) 5700 << " is unsafe, clamping to maximum safe vectorization factor " 5701 << ore::NV("VectorizationFactor", MaxSafeVF); 5702 }); 5703 return MaxSafeVF; 5704 } 5705 5706 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5707 5708 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5709 // Note that both WidestRegister and WidestType may not be a powers of 2. 
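  // For example (illustrative numbers only): if LAA reports a maximum safe
  // dependence width of 384 bits and the target's widest register is 512
  // bits, WidestRegister has been clamped to 384 above; with WidestType == 32
  // the computation below yields 384 / 32 == 12, which PowerOf2Floor rounds
  // down to a MaxVectorSize of 8.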
5710 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5711 5712 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5713 << " / " << WidestType << " bits.\n"); 5714 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5715 << WidestRegister << " bits.\n"); 5716 5717 assert(MaxVectorSize <= WidestRegister && 5718 "Did not expect to pack so many elements" 5719 " into one vector!"); 5720 if (MaxVectorSize == 0) { 5721 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5722 MaxVectorSize = 1; 5723 return ElementCount::getFixed(MaxVectorSize); 5724 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5725 isPowerOf2_32(ConstTripCount)) { 5726 // We need to clamp the VF to be the ConstTripCount. There is no point in 5727 // choosing a higher viable VF as done in the loop below. 5728 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5729 << ConstTripCount << "\n"); 5730 MaxVectorSize = ConstTripCount; 5731 return ElementCount::getFixed(MaxVectorSize); 5732 } 5733 5734 unsigned MaxVF = MaxVectorSize; 5735 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5736 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5737 // Collect all viable vectorization factors larger than the default MaxVF 5738 // (i.e. MaxVectorSize). 5739 SmallVector<ElementCount, 8> VFs; 5740 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5741 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5742 VFs.push_back(ElementCount::getFixed(VS)); 5743 5744 // For each VF calculate its register usage. 5745 auto RUs = calculateRegisterUsage(VFs); 5746 5747 // Select the largest VF which doesn't require more registers than existing 5748 // ones. 5749 for (int i = RUs.size() - 1; i >= 0; --i) { 5750 bool Selected = true; 5751 for (auto& pair : RUs[i].MaxLocalUsers) { 5752 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5753 if (pair.second > TargetNumRegisters) 5754 Selected = false; 5755 } 5756 if (Selected) { 5757 MaxVF = VFs[i].getKnownMinValue(); 5758 break; 5759 } 5760 } 5761 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5762 if (MaxVF < MinVF) { 5763 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5764 << ") with target's minimum: " << MinVF << '\n'); 5765 MaxVF = MinVF; 5766 } 5767 } 5768 } 5769 return ElementCount::getFixed(MaxVF); 5770 } 5771 5772 VectorizationFactor 5773 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5774 // FIXME: This can be fixed for scalable vectors later, because at this stage 5775 // the LoopVectorizer will only consider vectorizing a loop with scalable 5776 // vectors when the loop has a hint to enable vectorization for a given VF. 5777 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5778 5779 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5780 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5781 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5782 5783 unsigned Width = 1; 5784 const float ScalarCost = *ExpectedCost.getValue(); 5785 float Cost = ScalarCost; 5786 5787 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5788 if (ForceVectorization && MaxVF.isVector()) { 5789 // Ignore scalar width, because the user explicitly wants vectorization. 5790 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5791 // evaluation. 
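    // Sketch of the selection loop below with hypothetical costs: a scalar
    // cost of 8, a VF=2 cost of 10 and a VF=4 cost of 12 give per-lane costs
    // of 8, 5 and 3, so VF=4 is selected. When vectorization is forced we
    // start from the largest float instead of the scalar cost, so some vector
    // VF is chosen even if it is not cheaper than scalar.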
5792 Cost = std::numeric_limits<float>::max(); 5793 } 5794 5795 for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { 5796 // Notice that the vector loop needs to be executed less times, so 5797 // we need to divide the cost of the vector loops by the width of 5798 // the vector elements. 5799 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5800 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 5801 float VectorCost = *C.first.getValue() / (float)i; 5802 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5803 << " costs: " << (int)VectorCost << ".\n"); 5804 if (!C.second && !ForceVectorization) { 5805 LLVM_DEBUG( 5806 dbgs() << "LV: Not considering vector loop of width " << i 5807 << " because it will not generate any vector instructions.\n"); 5808 continue; 5809 } 5810 5811 // If profitable add it to ProfitableVF list. 5812 if (VectorCost < ScalarCost) { 5813 ProfitableVFs.push_back(VectorizationFactor( 5814 {ElementCount::getFixed(i), (unsigned)VectorCost})); 5815 } 5816 5817 if (VectorCost < Cost) { 5818 Cost = VectorCost; 5819 Width = i; 5820 } 5821 } 5822 5823 if (!EnableCondStoresVectorization && NumPredStores) { 5824 reportVectorizationFailure("There are conditional stores.", 5825 "store that is conditionally executed prevents vectorization", 5826 "ConditionalStore", ORE, TheLoop); 5827 Width = 1; 5828 Cost = ScalarCost; 5829 } 5830 5831 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5832 << "LV: Vectorization seems to be not beneficial, " 5833 << "but was forced by a user.\n"); 5834 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5835 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5836 (unsigned)(Width * Cost)}; 5837 return Factor; 5838 } 5839 5840 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5841 const Loop &L, ElementCount VF) const { 5842 // Cross iteration phis such as reductions need special handling and are 5843 // currently unsupported. 5844 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5845 return Legal->isFirstOrderRecurrence(&Phi) || 5846 Legal->isReductionVariable(&Phi); 5847 })) 5848 return false; 5849 5850 // Phis with uses outside of the loop require special handling and are 5851 // currently unsupported. 5852 for (auto &Entry : Legal->getInductionVars()) { 5853 // Look for uses of the value of the induction at the last iteration. 5854 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5855 for (User *U : PostInc->users()) 5856 if (!L.contains(cast<Instruction>(U))) 5857 return false; 5858 // Look for uses of penultimate value of the induction. 5859 for (User *U : Entry.first->users()) 5860 if (!L.contains(cast<Instruction>(U))) 5861 return false; 5862 } 5863 5864 // Induction variables that are widened require special handling that is 5865 // currently not supported. 5866 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5867 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5868 this->isProfitableToScalarize(Entry.first, VF)); 5869 })) 5870 return false; 5871 5872 return true; 5873 } 5874 5875 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5876 const ElementCount VF) const { 5877 // FIXME: We need a much better cost-model to take different parameters such 5878 // as register pressure, code size increase and cost of extra branches into 5879 // account. 
For now we apply a very crude heuristic and only consider loops 5880 // with vectorization factors larger than a certain value. 5881 // We also consider epilogue vectorization unprofitable for targets that don't 5882 // consider interleaving beneficial (eg. MVE). 5883 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5884 return false; 5885 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5886 return true; 5887 return false; 5888 } 5889 5890 VectorizationFactor 5891 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5892 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5893 VectorizationFactor Result = VectorizationFactor::Disabled(); 5894 if (!EnableEpilogueVectorization) { 5895 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5896 return Result; 5897 } 5898 5899 if (!isScalarEpilogueAllowed()) { 5900 LLVM_DEBUG( 5901 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5902 "allowed.\n";); 5903 return Result; 5904 } 5905 5906 // FIXME: This can be fixed for scalable vectors later, because at this stage 5907 // the LoopVectorizer will only consider vectorizing a loop with scalable 5908 // vectors when the loop has a hint to enable vectorization for a given VF. 5909 if (MainLoopVF.isScalable()) { 5910 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5911 "yet supported.\n"); 5912 return Result; 5913 } 5914 5915 // Not really a cost consideration, but check for unsupported cases here to 5916 // simplify the logic. 5917 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5918 LLVM_DEBUG( 5919 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5920 "not a supported candidate.\n";); 5921 return Result; 5922 } 5923 5924 if (EpilogueVectorizationForceVF > 1) { 5925 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5926 if (LVP.hasPlanWithVFs( 5927 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5928 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5929 else { 5930 LLVM_DEBUG( 5931 dbgs() 5932 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5933 return Result; 5934 } 5935 } 5936 5937 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5938 TheLoop->getHeader()->getParent()->hasMinSize()) { 5939 LLVM_DEBUG( 5940 dbgs() 5941 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5942 return Result; 5943 } 5944 5945 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 5946 return Result; 5947 5948 for (auto &NextVF : ProfitableVFs) 5949 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 5950 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 5951 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 5952 Result = NextVF; 5953 5954 if (Result != VectorizationFactor::Disabled()) 5955 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5956 << Result.Width.getFixedValue() << "\n";); 5957 return Result; 5958 } 5959 5960 std::pair<unsigned, unsigned> 5961 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5962 unsigned MinWidth = -1U; 5963 unsigned MaxWidth = 8; 5964 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5965 5966 // For each block. 5967 for (BasicBlock *BB : TheLoop->blocks()) { 5968 // For each instruction in the loop. 5969 for (Instruction &I : BB->instructionsWithoutDebug()) { 5970 Type *T = I.getType(); 5971 5972 // Skip ignored values. 
5973 if (ValuesToIgnore.count(&I)) 5974 continue; 5975 5976 // Only examine Loads, Stores and PHINodes. 5977 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5978 continue; 5979 5980 // Examine PHI nodes that are reduction variables. Update the type to 5981 // account for the recurrence type. 5982 if (auto *PN = dyn_cast<PHINode>(&I)) { 5983 if (!Legal->isReductionVariable(PN)) 5984 continue; 5985 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5986 T = RdxDesc.getRecurrenceType(); 5987 } 5988 5989 // Examine the stored values. 5990 if (auto *ST = dyn_cast<StoreInst>(&I)) 5991 T = ST->getValueOperand()->getType(); 5992 5993 // Ignore loaded pointer types and stored pointer types that are not 5994 // vectorizable. 5995 // 5996 // FIXME: The check here attempts to predict whether a load or store will 5997 // be vectorized. We only know this for certain after a VF has 5998 // been selected. Here, we assume that if an access can be 5999 // vectorized, it will be. We should also look at extending this 6000 // optimization to non-pointer types. 6001 // 6002 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6003 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6004 continue; 6005 6006 MinWidth = std::min(MinWidth, 6007 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6008 MaxWidth = std::max(MaxWidth, 6009 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6010 } 6011 } 6012 6013 return {MinWidth, MaxWidth}; 6014 } 6015 6016 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6017 unsigned LoopCost) { 6018 // -- The interleave heuristics -- 6019 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6020 // There are many micro-architectural considerations that we can't predict 6021 // at this level. For example, frontend pressure (on decode or fetch) due to 6022 // code size, or the number and capabilities of the execution ports. 6023 // 6024 // We use the following heuristics to select the interleave count: 6025 // 1. If the code has reductions, then we interleave to break the cross 6026 // iteration dependency. 6027 // 2. If the loop is really small, then we interleave to reduce the loop 6028 // overhead. 6029 // 3. We don't interleave if we think that we will spill registers to memory 6030 // due to the increased register pressure. 6031 6032 if (!isScalarEpilogueAllowed()) 6033 return 1; 6034 6035 // We used the distance for the interleave count. 6036 if (Legal->getMaxSafeDepDistBytes() != -1U) 6037 return 1; 6038 6039 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6040 const bool HasReductions = !Legal->getReductionVars().empty(); 6041 // Do not interleave loops with a relatively small known or estimated trip 6042 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6043 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6044 // because with the above conditions interleaving can expose ILP and break 6045 // cross iteration dependences for reductions. 6046 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6047 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6048 return 1; 6049 6050 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6051 // We divide by these constants so assume that we have at least one 6052 // instruction that uses at least one register. 
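  // Worked example of the computation below, ignoring the induction-variable
  // adjustment (illustrative numbers only): with 32 registers in a class, 2
  // of them tied up by loop-invariant values and at most 5 values of that
  // class live at once, (32 - 2) / 5 == 6 rounds down to 4 interleaved copies
  // after PowerOf2Floor, before we expect spilling.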
6053 for (auto& pair : R.MaxLocalUsers) { 6054 pair.second = std::max(pair.second, 1U); 6055 } 6056 6057 // We calculate the interleave count using the following formula. 6058 // Subtract the number of loop invariants from the number of available 6059 // registers. These registers are used by all of the interleaved instances. 6060 // Next, divide the remaining registers by the number of registers that is 6061 // required by the loop, in order to estimate how many parallel instances 6062 // fit without causing spills. All of this is rounded down if necessary to be 6063 // a power of two. We want power of two interleave count to simplify any 6064 // addressing operations or alignment considerations. 6065 // We also want power of two interleave counts to ensure that the induction 6066 // variable of the vector loop wraps to zero, when tail is folded by masking; 6067 // this currently happens when OptForSize, in which case IC is set to 1 above. 6068 unsigned IC = UINT_MAX; 6069 6070 for (auto& pair : R.MaxLocalUsers) { 6071 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6072 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6073 << " registers of " 6074 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6075 if (VF.isScalar()) { 6076 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6077 TargetNumRegisters = ForceTargetNumScalarRegs; 6078 } else { 6079 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6080 TargetNumRegisters = ForceTargetNumVectorRegs; 6081 } 6082 unsigned MaxLocalUsers = pair.second; 6083 unsigned LoopInvariantRegs = 0; 6084 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6085 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6086 6087 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6088 // Don't count the induction variable as interleaved. 6089 if (EnableIndVarRegisterHeur) { 6090 TmpIC = 6091 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6092 std::max(1U, (MaxLocalUsers - 1))); 6093 } 6094 6095 IC = std::min(IC, TmpIC); 6096 } 6097 6098 // Clamp the interleave ranges to reasonable counts. 6099 unsigned MaxInterleaveCount = 6100 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6101 6102 // Check if the user has overridden the max. 6103 if (VF.isScalar()) { 6104 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6105 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6106 } else { 6107 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6108 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6109 } 6110 6111 // If trip count is known or estimated compile time constant, limit the 6112 // interleave count to be less than the trip count divided by VF, provided it 6113 // is at least 1. 6114 // 6115 // For scalable vectors we can't know if interleaving is beneficial. It may 6116 // not be beneficial for small loops if none of the lanes in the second vector 6117 // iterations is enabled. However, for larger loops, there is likely to be a 6118 // similar benefit as for fixed-width vectors. For now, we choose to leave 6119 // the InterleaveCount as if vscale is '1', although if some information about 6120 // the vector is known (e.g. min vector size), we can make a better decision. 6121 if (BestKnownTC) { 6122 MaxInterleaveCount = 6123 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6124 // Make sure MaxInterleaveCount is greater than 0. 
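    // E.g. (illustrative): an estimated trip count of 20 with VF = 8 allows
    // at most 20 / 8 == 2 interleaved copies here, even if the register-based
    // estimate and the target would allow more; a trip count smaller than VF
    // yields 0, which the std::max below bumps back up to 1.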
6125 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6126 } 6127 6128 assert(MaxInterleaveCount > 0 && 6129 "Maximum interleave count must be greater than 0"); 6130 6131 // Clamp the calculated IC to be between the 1 and the max interleave count 6132 // that the target and trip count allows. 6133 if (IC > MaxInterleaveCount) 6134 IC = MaxInterleaveCount; 6135 else 6136 // Make sure IC is greater than 0. 6137 IC = std::max(1u, IC); 6138 6139 assert(IC > 0 && "Interleave count must be greater than 0."); 6140 6141 // If we did not calculate the cost for VF (because the user selected the VF) 6142 // then we calculate the cost of VF here. 6143 if (LoopCost == 0) { 6144 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6145 LoopCost = *expectedCost(VF).first.getValue(); 6146 } 6147 6148 assert(LoopCost && "Non-zero loop cost expected"); 6149 6150 // Interleave if we vectorized this loop and there is a reduction that could 6151 // benefit from interleaving. 6152 if (VF.isVector() && HasReductions) { 6153 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6154 return IC; 6155 } 6156 6157 // Note that if we've already vectorized the loop we will have done the 6158 // runtime check and so interleaving won't require further checks. 6159 bool InterleavingRequiresRuntimePointerCheck = 6160 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6161 6162 // We want to interleave small loops in order to reduce the loop overhead and 6163 // potentially expose ILP opportunities. 6164 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6165 << "LV: IC is " << IC << '\n' 6166 << "LV: VF is " << VF << '\n'); 6167 const bool AggressivelyInterleaveReductions = 6168 TTI.enableAggressiveInterleaving(HasReductions); 6169 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6170 // We assume that the cost overhead is 1 and we use the cost model 6171 // to estimate the cost of the loop and interleave until the cost of the 6172 // loop overhead is about 5% of the cost of the loop. 6173 unsigned SmallIC = 6174 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6175 6176 // Interleave until store/load ports (estimated by max interleave count) are 6177 // saturated. 6178 unsigned NumStores = Legal->getNumStores(); 6179 unsigned NumLoads = Legal->getNumLoads(); 6180 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6181 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6182 6183 // If we have a scalar reduction (vector reductions are already dealt with 6184 // by this point), we can increase the critical path length if the loop 6185 // we're interleaving is inside another loop. Limit, by default to 2, so the 6186 // critical path only gets increased by one reduction operation. 6187 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6188 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6189 SmallIC = std::min(SmallIC, F); 6190 StoresIC = std::min(StoresIC, F); 6191 LoadsIC = std::min(LoadsIC, F); 6192 } 6193 6194 if (EnableLoadStoreRuntimeInterleave && 6195 std::max(StoresIC, LoadsIC) > SmallIC) { 6196 LLVM_DEBUG( 6197 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6198 return std::max(StoresIC, LoadsIC); 6199 } 6200 6201 // If there are scalar reductions and TTI has enabled aggressive 6202 // interleaving for reductions, we will interleave to expose ILP. 
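    // Illustrative only: for a scalar loop with a reduction where IC == 8 and
    // SmallIC == 2, the branch below returns std::max(8 / 2, 2) == 4 copies,
    // a compromise between exposing ILP for the reduction and the small-loop
    // limit.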
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are met
  // before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi-map that
  // holds the list of intervals that *end* at a specific location. This
  // multi-map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it
  // in a set. If we find this value in the multi-map then we remove it from
  // the set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because, when we unroll, loop-invariant values
  // do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps each index to its instruction.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instructions that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
6278 if (!TheLoop->contains(Instr)) { 6279 LoopInvariants.insert(Instr); 6280 continue; 6281 } 6282 6283 // Overwrite previous end points. 6284 EndPoint[Instr] = IdxToInstr.size(); 6285 Ends.insert(Instr); 6286 } 6287 } 6288 } 6289 6290 // Saves the list of intervals that end with the index in 'key'. 6291 using InstrList = SmallVector<Instruction *, 2>; 6292 DenseMap<unsigned, InstrList> TransposeEnds; 6293 6294 // Transpose the EndPoints to a list of values that end at each index. 6295 for (auto &Interval : EndPoint) 6296 TransposeEnds[Interval.second].push_back(Interval.first); 6297 6298 SmallPtrSet<Instruction *, 8> OpenIntervals; 6299 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6300 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6301 6302 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6303 6304 // A lambda that gets the register usage for the given type and VF. 6305 const auto &TTICapture = TTI; 6306 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6307 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6308 return 0U; 6309 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6310 }; 6311 6312 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6313 Instruction *I = IdxToInstr[i]; 6314 6315 // Remove all of the instructions that end at this location. 6316 InstrList &List = TransposeEnds[i]; 6317 for (Instruction *ToRemove : List) 6318 OpenIntervals.erase(ToRemove); 6319 6320 // Ignore instructions that are never used within the loop. 6321 if (!Ends.count(I)) 6322 continue; 6323 6324 // Skip ignored values. 6325 if (ValuesToIgnore.count(I)) 6326 continue; 6327 6328 // For each VF find the maximum usage of registers. 6329 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6330 // Count the number of live intervals. 6331 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6332 6333 if (VFs[j].isScalar()) { 6334 for (auto Inst : OpenIntervals) { 6335 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6336 if (RegUsage.find(ClassID) == RegUsage.end()) 6337 RegUsage[ClassID] = 1; 6338 else 6339 RegUsage[ClassID] += 1; 6340 } 6341 } else { 6342 collectUniformsAndScalars(VFs[j]); 6343 for (auto Inst : OpenIntervals) { 6344 // Skip ignored values for VF > 1. 6345 if (VecValuesToIgnore.count(Inst)) 6346 continue; 6347 if (isScalarAfterVectorization(Inst, VFs[j])) { 6348 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6349 if (RegUsage.find(ClassID) == RegUsage.end()) 6350 RegUsage[ClassID] = 1; 6351 else 6352 RegUsage[ClassID] += 1; 6353 } else { 6354 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6355 if (RegUsage.find(ClassID) == RegUsage.end()) 6356 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6357 else 6358 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6359 } 6360 } 6361 } 6362 6363 for (auto& pair : RegUsage) { 6364 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6365 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6366 else 6367 MaxUsages[j][pair.first] = pair.second; 6368 } 6369 } 6370 6371 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6372 << OpenIntervals.size() << '\n'); 6373 6374 // Add the current instruction to the list of open intervals. 
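    // Rough sketch of the bookkeeping (hypothetical instructions):
    //   %a = load i32, ...    ; idx 0
    //   %b = add i32 %a, 1    ; idx 1
    //   %c = mul i32 %a, %b   ; idx 2
    //   store i32 %c, ...     ; idx 3
    // While visiting %c the open intervals are {%a, %b}, so the maximum usage
    // recorded for this register class is 2 simultaneously-live values
    // (scaled by GetRegUsage for vector VFs).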
6375 OpenIntervals.insert(I); 6376 } 6377 6378 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6379 SmallMapVector<unsigned, unsigned, 4> Invariant; 6380 6381 for (auto Inst : LoopInvariants) { 6382 unsigned Usage = 6383 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6384 unsigned ClassID = 6385 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6386 if (Invariant.find(ClassID) == Invariant.end()) 6387 Invariant[ClassID] = Usage; 6388 else 6389 Invariant[ClassID] += Usage; 6390 } 6391 6392 LLVM_DEBUG({ 6393 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6394 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6395 << " item\n"; 6396 for (const auto &pair : MaxUsages[i]) { 6397 dbgs() << "LV(REG): RegisterClass: " 6398 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6399 << " registers\n"; 6400 } 6401 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6402 << " item\n"; 6403 for (const auto &pair : Invariant) { 6404 dbgs() << "LV(REG): RegisterClass: " 6405 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6406 << " registers\n"; 6407 } 6408 }); 6409 6410 RU.LoopInvariantRegs = Invariant; 6411 RU.MaxLocalUsers = MaxUsages[i]; 6412 RUs[i] = RU; 6413 } 6414 6415 return RUs; 6416 } 6417 6418 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6419 // TODO: Cost model for emulated masked load/store is completely 6420 // broken. This hack guides the cost model to use an artificially 6421 // high enough value to practically disable vectorization with such 6422 // operations, except where previously deployed legality hack allowed 6423 // using very low cost values. This is to avoid regressions coming simply 6424 // from moving "masked load/store" check from legality to cost model. 6425 // Masked Load/Gather emulation was previously never allowed. 6426 // Limited number of Masked Store/Scatter emulation was allowed. 6427 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6428 return isa<LoadInst>(I) || 6429 (isa<StoreInst>(I) && 6430 NumPredStores > NumberOfStoresToPredicate); 6431 } 6432 6433 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6434 // If we aren't vectorizing the loop, or if we've already collected the 6435 // instructions to scalarize, there's nothing to do. Collection may already 6436 // have occurred if we have a user-selected VF and are now computing the 6437 // expected cost for interleaving. 6438 if (VF.isScalar() || VF.isZero() || 6439 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6440 return; 6441 6442 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6443 // not profitable to scalarize any instructions, the presence of VF in the 6444 // map will indicate that we've analyzed it already. 6445 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6446 6447 // Find all the instructions that are scalar with predication in the loop and 6448 // determine if it would be better to not if-convert the blocks they are in. 6449 // If so, we also record the instructions to scalarize. 6450 for (BasicBlock *BB : TheLoop->blocks()) { 6451 if (!blockNeedsPredication(BB)) 6452 continue; 6453 for (Instruction &I : *BB) 6454 if (isScalarWithPredication(&I)) { 6455 ScalarCostsTy ScalarCosts; 6456 // Do not apply discount logic if hacked cost is needed 6457 // for emulated masked memrefs. 
6458 if (!useEmulatedMaskMemRefHack(&I) && 6459 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6460 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6461 // Remember that BB will remain after vectorization. 6462 PredicatedBBsAfterVectorization.insert(BB); 6463 } 6464 } 6465 } 6466 6467 int LoopVectorizationCostModel::computePredInstDiscount( 6468 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6469 assert(!isUniformAfterVectorization(PredInst, VF) && 6470 "Instruction marked uniform-after-vectorization will be predicated"); 6471 6472 // Initialize the discount to zero, meaning that the scalar version and the 6473 // vector version cost the same. 6474 InstructionCost Discount = 0; 6475 6476 // Holds instructions to analyze. The instructions we visit are mapped in 6477 // ScalarCosts. Those instructions are the ones that would be scalarized if 6478 // we find that the scalar version costs less. 6479 SmallVector<Instruction *, 8> Worklist; 6480 6481 // Returns true if the given instruction can be scalarized. 6482 auto canBeScalarized = [&](Instruction *I) -> bool { 6483 // We only attempt to scalarize instructions forming a single-use chain 6484 // from the original predicated block that would otherwise be vectorized. 6485 // Although not strictly necessary, we give up on instructions we know will 6486 // already be scalar to avoid traversing chains that are unlikely to be 6487 // beneficial. 6488 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6489 isScalarAfterVectorization(I, VF)) 6490 return false; 6491 6492 // If the instruction is scalar with predication, it will be analyzed 6493 // separately. We ignore it within the context of PredInst. 6494 if (isScalarWithPredication(I)) 6495 return false; 6496 6497 // If any of the instruction's operands are uniform after vectorization, 6498 // the instruction cannot be scalarized. This prevents, for example, a 6499 // masked load from being scalarized. 6500 // 6501 // We assume we will only emit a value for lane zero of an instruction 6502 // marked uniform after vectorization, rather than VF identical values. 6503 // Thus, if we scalarize an instruction that uses a uniform, we would 6504 // create uses of values corresponding to the lanes we aren't emitting code 6505 // for. This behavior can be changed by allowing getScalarValue to clone 6506 // the lane zero values for uniforms rather than asserting. 6507 for (Use &U : I->operands()) 6508 if (auto *J = dyn_cast<Instruction>(U.get())) 6509 if (isUniformAfterVectorization(J, VF)) 6510 return false; 6511 6512 // Otherwise, we can scalarize the instruction. 6513 return true; 6514 }; 6515 6516 // Compute the expected cost discount from scalarizing the entire expression 6517 // feeding the predicated instruction. We currently only consider expressions 6518 // that are single-use instruction chains. 6519 Worklist.push_back(PredInst); 6520 while (!Worklist.empty()) { 6521 Instruction *I = Worklist.pop_back_val(); 6522 6523 // If we've already analyzed the instruction, there's nothing to do. 6524 if (ScalarCosts.find(I) != ScalarCosts.end()) 6525 continue; 6526 6527 // Compute the cost of the vector instruction. Note that this cost already 6528 // includes the scalarization overhead of the predicated instruction. 6529 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6530 6531 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6532 // the instruction as if it wasn't if-converted and instead remained in the 6533 // predicated block. We will scale this cost by block probability after 6534 // computing the scalarization overhead. 6535 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6536 InstructionCost ScalarCost = 6537 VF.getKnownMinValue() * 6538 getInstructionCost(I, ElementCount::getFixed(1)).first; 6539 6540 // Compute the scalarization overhead of needed insertelement instructions 6541 // and phi nodes. 6542 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6543 ScalarCost += TTI.getScalarizationOverhead( 6544 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6545 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6546 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6547 ScalarCost += 6548 VF.getKnownMinValue() * 6549 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6550 } 6551 6552 // Compute the scalarization overhead of needed extractelement 6553 // instructions. For each of the instruction's operands, if the operand can 6554 // be scalarized, add it to the worklist; otherwise, account for the 6555 // overhead. 6556 for (Use &U : I->operands()) 6557 if (auto *J = dyn_cast<Instruction>(U.get())) { 6558 assert(VectorType::isValidElementType(J->getType()) && 6559 "Instruction has non-scalar type"); 6560 if (canBeScalarized(J)) 6561 Worklist.push_back(J); 6562 else if (needsExtract(J, VF)) { 6563 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6564 ScalarCost += TTI.getScalarizationOverhead( 6565 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6566 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6567 } 6568 } 6569 6570 // Scale the total scalar cost by block probability. 6571 ScalarCost /= getReciprocalPredBlockProb(); 6572 6573 // Compute the discount. A non-negative discount means the vector version 6574 // of the instruction costs more, and scalarizing would be beneficial. 6575 Discount += VectorCost - ScalarCost; 6576 ScalarCosts[I] = ScalarCost; 6577 } 6578 6579 return *Discount.getValue(); 6580 } 6581 6582 LoopVectorizationCostModel::VectorizationCostTy 6583 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6584 VectorizationCostTy Cost; 6585 6586 // For each block. 6587 for (BasicBlock *BB : TheLoop->blocks()) { 6588 VectorizationCostTy BlockCost; 6589 6590 // For each instruction in the old loop. 6591 for (Instruction &I : BB->instructionsWithoutDebug()) { 6592 // Skip ignored values. 6593 if (ValuesToIgnore.count(&I) || 6594 (VF.isVector() && VecValuesToIgnore.count(&I))) 6595 continue; 6596 6597 VectorizationCostTy C = getInstructionCost(&I, VF); 6598 6599 // Check if we should override the cost. 6600 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6601 C.first = InstructionCost(ForceTargetInstructionCost); 6602 6603 BlockCost.first += C.first; 6604 BlockCost.second |= C.second; 6605 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6606 << " for VF " << VF << " For instruction: " << I 6607 << '\n'); 6608 } 6609 6610 // If we are vectorizing a predicated block, it will have been 6611 // if-converted. This means that the block's instructions (aside from 6612 // stores and instructions that may divide by zero) will now be 6613 // unconditionally executed. For the scalar case, we may not always execute 6614 // the predicated block, if it is an if-else block. Thus, scale the block's 6615 // cost by the probability of executing it. 
blockNeedsPredication from 6616 // Legal is used so as to not include all blocks in tail folded loops. 6617 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6618 BlockCost.first /= getReciprocalPredBlockProb(); 6619 6620 Cost.first += BlockCost.first; 6621 Cost.second |= BlockCost.second; 6622 } 6623 6624 return Cost; 6625 } 6626 6627 /// Gets Address Access SCEV after verifying that the access pattern 6628 /// is loop invariant except the induction variable dependence. 6629 /// 6630 /// This SCEV can be sent to the Target in order to estimate the address 6631 /// calculation cost. 6632 static const SCEV *getAddressAccessSCEV( 6633 Value *Ptr, 6634 LoopVectorizationLegality *Legal, 6635 PredicatedScalarEvolution &PSE, 6636 const Loop *TheLoop) { 6637 6638 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6639 if (!Gep) 6640 return nullptr; 6641 6642 // We are looking for a gep with all loop invariant indices except for one 6643 // which should be an induction variable. 6644 auto SE = PSE.getSE(); 6645 unsigned NumOperands = Gep->getNumOperands(); 6646 for (unsigned i = 1; i < NumOperands; ++i) { 6647 Value *Opd = Gep->getOperand(i); 6648 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6649 !Legal->isInductionVariable(Opd)) 6650 return nullptr; 6651 } 6652 6653 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6654 return PSE.getSCEV(Ptr); 6655 } 6656 6657 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6658 return Legal->hasStride(I->getOperand(0)) || 6659 Legal->hasStride(I->getOperand(1)); 6660 } 6661 6662 InstructionCost 6663 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6664 ElementCount VF) { 6665 assert(VF.isVector() && 6666 "Scalarization cost of instruction implies vectorization."); 6667 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6668 Type *ValTy = getMemInstValueType(I); 6669 auto SE = PSE.getSE(); 6670 6671 unsigned AS = getLoadStoreAddressSpace(I); 6672 Value *Ptr = getLoadStorePointerOperand(I); 6673 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6674 6675 // Figure out whether the access is strided and get the stride value 6676 // if it's known in compile time 6677 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6678 6679 // Get the cost of the scalar memory instruction and address computation. 6680 InstructionCost Cost = 6681 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6682 6683 // Don't pass *I here, since it is scalar but will actually be part of a 6684 // vectorized loop where the user of it is a vectorized instruction. 6685 const Align Alignment = getLoadStoreAlignment(I); 6686 Cost += VF.getKnownMinValue() * 6687 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6688 AS, TTI::TCK_RecipThroughput); 6689 6690 // Get the overhead of the extractelement and insertelement instructions 6691 // we might create due to scalarization. 6692 Cost += getScalarizationOverhead(I, VF); 6693 6694 // If we have a predicated store, it may not be executed for each vector 6695 // lane. Scale the cost by the probability of executing the predicated 6696 // block. 6697 if (isPredicatedInst(I)) { 6698 Cost /= getReciprocalPredBlockProb(); 6699 6700 if (useEmulatedMaskMemRefHack(I)) 6701 // Artificially setting to a high enough value to practically disable 6702 // vectorization with such operations. 
6703 Cost = 3000000; 6704 } 6705 6706 return Cost; 6707 } 6708 6709 InstructionCost 6710 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6711 ElementCount VF) { 6712 Type *ValTy = getMemInstValueType(I); 6713 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6714 Value *Ptr = getLoadStorePointerOperand(I); 6715 unsigned AS = getLoadStoreAddressSpace(I); 6716 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6717 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6718 6719 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6720 "Stride should be 1 or -1 for consecutive memory access"); 6721 const Align Alignment = getLoadStoreAlignment(I); 6722 InstructionCost Cost = 0; 6723 if (Legal->isMaskRequired(I)) 6724 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6725 CostKind); 6726 else 6727 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6728 CostKind, I); 6729 6730 bool Reverse = ConsecutiveStride < 0; 6731 if (Reverse) 6732 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6733 return Cost; 6734 } 6735 6736 InstructionCost 6737 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6738 ElementCount VF) { 6739 assert(Legal->isUniformMemOp(*I)); 6740 6741 Type *ValTy = getMemInstValueType(I); 6742 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6743 const Align Alignment = getLoadStoreAlignment(I); 6744 unsigned AS = getLoadStoreAddressSpace(I); 6745 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6746 if (isa<LoadInst>(I)) { 6747 return TTI.getAddressComputationCost(ValTy) + 6748 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6749 CostKind) + 6750 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6751 } 6752 StoreInst *SI = cast<StoreInst>(I); 6753 6754 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6755 return TTI.getAddressComputationCost(ValTy) + 6756 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6757 CostKind) + 6758 (isLoopInvariantStoreValue 6759 ? 0 6760 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6761 VF.getKnownMinValue() - 1)); 6762 } 6763 6764 InstructionCost 6765 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6766 ElementCount VF) { 6767 Type *ValTy = getMemInstValueType(I); 6768 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6769 const Align Alignment = getLoadStoreAlignment(I); 6770 const Value *Ptr = getLoadStorePointerOperand(I); 6771 6772 return TTI.getAddressComputationCost(VectorTy) + 6773 TTI.getGatherScatterOpCost( 6774 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6775 TargetTransformInfo::TCK_RecipThroughput, I); 6776 } 6777 6778 InstructionCost 6779 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6780 ElementCount VF) { 6781 Type *ValTy = getMemInstValueType(I); 6782 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6783 unsigned AS = getLoadStoreAddressSpace(I); 6784 6785 auto Group = getInterleavedAccessGroup(I); 6786 assert(Group && "Fail to get an interleaved access group."); 6787 6788 unsigned InterleaveFactor = Group->getFactor(); 6789 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6790 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6791 6792 // Holds the indices of existing members in an interleaved load group. 6793 // An interleaved store group doesn't need this as it doesn't allow gaps. 
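  // For example (hypothetical): a factor-3 load group where only members 0
  // and 2 exist (the field at index 1 is never read) gets Indices = {0, 2};
  // the gap is why such a group may require a scalar epilogue or, failing
  // that, a mask for the gaps below.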
6794 SmallVector<unsigned, 4> Indices; 6795 if (isa<LoadInst>(I)) { 6796 for (unsigned i = 0; i < InterleaveFactor; i++) 6797 if (Group->getMember(i)) 6798 Indices.push_back(i); 6799 } 6800 6801 // Calculate the cost of the whole interleaved group. 6802 bool UseMaskForGaps = 6803 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6804 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6805 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6806 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6807 6808 if (Group->isReverse()) { 6809 // TODO: Add support for reversed masked interleaved access. 6810 assert(!Legal->isMaskRequired(I) && 6811 "Reverse masked interleaved access not supported."); 6812 Cost += Group->getNumMembers() * 6813 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6814 } 6815 return Cost; 6816 } 6817 6818 InstructionCost 6819 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6820 ElementCount VF) { 6821 // Calculate scalar cost only. Vectorization cost should be ready at this 6822 // moment. 6823 if (VF.isScalar()) { 6824 Type *ValTy = getMemInstValueType(I); 6825 const Align Alignment = getLoadStoreAlignment(I); 6826 unsigned AS = getLoadStoreAddressSpace(I); 6827 6828 return TTI.getAddressComputationCost(ValTy) + 6829 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6830 TTI::TCK_RecipThroughput, I); 6831 } 6832 return getWideningCost(I, VF); 6833 } 6834 6835 LoopVectorizationCostModel::VectorizationCostTy 6836 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6837 ElementCount VF) { 6838 // If we know that this instruction will remain uniform, check the cost of 6839 // the scalar version. 6840 if (isUniformAfterVectorization(I, VF)) 6841 VF = ElementCount::getFixed(1); 6842 6843 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6844 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6845 6846 // Forced scalars do not have any scalarization overhead. 6847 auto ForcedScalar = ForcedScalars.find(VF); 6848 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6849 auto InstSet = ForcedScalar->second; 6850 if (InstSet.count(I)) 6851 return VectorizationCostTy( 6852 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6853 VF.getKnownMinValue()), 6854 false); 6855 } 6856 6857 Type *VectorTy; 6858 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6859 6860 bool TypeNotScalarized = 6861 VF.isVector() && VectorTy->isVectorTy() && 6862 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6863 return VectorizationCostTy(C, TypeNotScalarized); 6864 } 6865 6866 InstructionCost 6867 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6868 ElementCount VF) { 6869 6870 assert(!VF.isScalable() && 6871 "cannot compute scalarization overhead for scalable vectorization"); 6872 if (VF.isScalar()) 6873 return 0; 6874 6875 InstructionCost Cost = 0; 6876 Type *RetTy = ToVectorTy(I->getType(), VF); 6877 if (!RetTy->isVoidTy() && 6878 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6879 Cost += TTI.getScalarizationOverhead( 6880 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 6881 true, false); 6882 6883 // Some targets keep addresses scalar. 6884 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6885 return Cost; 6886 6887 // Some targets support efficient element stores. 
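// In that case the stored value needs no per-lane extracts, so the cost accumulated so far is returned without adding the operand scalarization overhead computed below.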
6888 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6889 return Cost; 6890 6891 // Collect operands to consider. 6892 CallInst *CI = dyn_cast<CallInst>(I); 6893 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6894 6895 // Skip operands that do not require extraction/scalarization and do not incur 6896 // any overhead. 6897 return Cost + TTI.getOperandsScalarizationOverhead( 6898 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 6899 } 6900 6901 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6902 if (VF.isScalar()) 6903 return; 6904 NumPredStores = 0; 6905 for (BasicBlock *BB : TheLoop->blocks()) { 6906 // For each instruction in the old loop. 6907 for (Instruction &I : *BB) { 6908 Value *Ptr = getLoadStorePointerOperand(&I); 6909 if (!Ptr) 6910 continue; 6911 6912 // TODO: We should generate better code and update the cost model for 6913 // predicated uniform stores. Today they are treated as any other 6914 // predicated store (see added test cases in 6915 // invariant-store-vectorization.ll). 6916 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6917 NumPredStores++; 6918 6919 if (Legal->isUniformMemOp(I)) { 6920 // TODO: Avoid replicating loads and stores instead of 6921 // relying on instcombine to remove them. 6922 // Load: Scalar load + broadcast 6923 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6924 InstructionCost Cost = getUniformMemOpCost(&I, VF); 6925 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6926 continue; 6927 } 6928 6929 // We assume that widening is the best solution when possible. 6930 if (memoryInstructionCanBeWidened(&I, VF)) { 6931 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6932 int ConsecutiveStride = 6933 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6934 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6935 "Expected consecutive stride."); 6936 InstWidening Decision = 6937 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6938 setWideningDecision(&I, VF, Decision, Cost); 6939 continue; 6940 } 6941 6942 // Choose between Interleaving, Gather/Scatter or Scalarization. 6943 InstructionCost InterleaveCost = std::numeric_limits<int>::max(); 6944 unsigned NumAccesses = 1; 6945 if (isAccessInterleaved(&I)) { 6946 auto Group = getInterleavedAccessGroup(&I); 6947 assert(Group && "Fail to get an interleaved access group."); 6948 6949 // Make one decision for the whole group. 6950 if (getWideningDecision(&I, VF) != CM_Unknown) 6951 continue; 6952 6953 NumAccesses = Group->getNumMembers(); 6954 if (interleavedAccessCanBeWidened(&I, VF)) 6955 InterleaveCost = getInterleaveGroupCost(&I, VF); 6956 } 6957 6958 InstructionCost GatherScatterCost = 6959 isLegalGatherOrScatter(&I) 6960 ? getGatherScatterCost(&I, VF) * NumAccesses 6961 : std::numeric_limits<int>::max(); 6962 6963 InstructionCost ScalarizationCost = 6964 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6965 6966 // Choose better solution for the current VF, 6967 // write down this decision and use it during vectorization. 
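// Illustrative example (made-up costs): with InterleaveCost = 8, GatherScatterCost = 8 and ScalarizationCost = 10, interleaving is chosen, since ties between interleaving and gather/scatter are resolved in favor of interleaving below.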
6968 InstructionCost Cost; 6969 InstWidening Decision; 6970 if (InterleaveCost <= GatherScatterCost && 6971 InterleaveCost < ScalarizationCost) { 6972 Decision = CM_Interleave; 6973 Cost = InterleaveCost; 6974 } else if (GatherScatterCost < ScalarizationCost) { 6975 Decision = CM_GatherScatter; 6976 Cost = GatherScatterCost; 6977 } else { 6978 Decision = CM_Scalarize; 6979 Cost = ScalarizationCost; 6980 } 6981 // If the instruction belongs to an interleave group, the whole group 6982 // receives the same decision. The whole group receives the cost, but 6983 // the cost will actually be assigned to one instruction. 6984 if (auto Group = getInterleavedAccessGroup(&I)) 6985 setWideningDecision(Group, VF, Decision, Cost); 6986 else 6987 setWideningDecision(&I, VF, Decision, Cost); 6988 } 6989 } 6990 6991 // Make sure that any load of address and any other address computation 6992 // remains scalar unless there is gather/scatter support. This avoids 6993 // inevitable extracts into address registers, and also has the benefit of 6994 // activating LSR more, since that pass can't optimize vectorized 6995 // addresses. 6996 if (TTI.prefersVectorizedAddressing()) 6997 return; 6998 6999 // Start with all scalar pointer uses. 7000 SmallPtrSet<Instruction *, 8> AddrDefs; 7001 for (BasicBlock *BB : TheLoop->blocks()) 7002 for (Instruction &I : *BB) { 7003 Instruction *PtrDef = 7004 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7005 if (PtrDef && TheLoop->contains(PtrDef) && 7006 getWideningDecision(&I, VF) != CM_GatherScatter) 7007 AddrDefs.insert(PtrDef); 7008 } 7009 7010 // Add all instructions used to generate the addresses. 7011 SmallVector<Instruction *, 4> Worklist; 7012 for (auto *I : AddrDefs) 7013 Worklist.push_back(I); 7014 while (!Worklist.empty()) { 7015 Instruction *I = Worklist.pop_back_val(); 7016 for (auto &Op : I->operands()) 7017 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7018 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7019 AddrDefs.insert(InstOp).second) 7020 Worklist.push_back(InstOp); 7021 } 7022 7023 for (auto *I : AddrDefs) { 7024 if (isa<LoadInst>(I)) { 7025 // Setting the desired widening decision should ideally be handled by 7026 // the cost functions, but since this involves the task of finding out 7027 // if the loaded register is involved in an address computation, it is 7028 // instead changed here when we know this is the case. 7029 InstWidening Decision = getWideningDecision(I, VF); 7030 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7031 // Scalarize a widened load of address. 7032 setWideningDecision( 7033 I, VF, CM_Scalarize, 7034 (VF.getKnownMinValue() * 7035 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7036 else if (auto Group = getInterleavedAccessGroup(I)) { 7037 // Scalarize an interleave group of address loads. 7038 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7039 if (Instruction *Member = Group->getMember(I)) 7040 setWideningDecision( 7041 Member, VF, CM_Scalarize, 7042 (VF.getKnownMinValue() * 7043 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7044 } 7045 } 7046 } else 7047 // Make sure I gets scalarized and a cost estimate without 7048 // scalarization overhead.
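// ForcedScalars is consulted in getInstructionCost, where such instructions are costed as VF scalar copies with no extract/insert overhead added.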
7049 ForcedScalars[VF].insert(I); 7050 } 7051 } 7052 7053 InstructionCost 7054 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7055 Type *&VectorTy) { 7056 Type *RetTy = I->getType(); 7057 if (canTruncateToMinimalBitwidth(I, VF)) 7058 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7059 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7060 auto SE = PSE.getSE(); 7061 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7062 7063 // TODO: We need to estimate the cost of intrinsic calls. 7064 switch (I->getOpcode()) { 7065 case Instruction::GetElementPtr: 7066 // We mark this instruction as zero-cost because the cost of GEPs in 7067 // vectorized code depends on whether the corresponding memory instruction 7068 // is scalarized or not. Therefore, we handle GEPs with the memory 7069 // instruction cost. 7070 return 0; 7071 case Instruction::Br: { 7072 // In cases of scalarized and predicated instructions, there will be VF 7073 // predicated blocks in the vectorized loop. Each branch around these 7074 // blocks requires also an extract of its vector compare i1 element. 7075 bool ScalarPredicatedBB = false; 7076 BranchInst *BI = cast<BranchInst>(I); 7077 if (VF.isVector() && BI->isConditional() && 7078 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7079 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7080 ScalarPredicatedBB = true; 7081 7082 if (ScalarPredicatedBB) { 7083 // Return cost for branches around scalarized and predicated blocks. 7084 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7085 auto *Vec_i1Ty = 7086 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7087 return (TTI.getScalarizationOverhead( 7088 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7089 false, true) + 7090 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7091 VF.getKnownMinValue())); 7092 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7093 // The back-edge branch will remain, as will all scalar branches. 7094 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7095 else 7096 // This branch will be eliminated by if-conversion. 7097 return 0; 7098 // Note: We currently assume zero cost for an unconditional branch inside 7099 // a predicated block since it will become a fall-through, although we 7100 // may decide in the future to call TTI for all branches. 7101 } 7102 case Instruction::PHI: { 7103 auto *Phi = cast<PHINode>(I); 7104 7105 // First-order recurrences are replaced by vector shuffles inside the loop. 7106 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7107 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7108 return TTI.getShuffleCost( 7109 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7110 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7111 7112 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7113 // converted into select instructions. We require N - 1 selects per phi 7114 // node, where N is the number of incoming values. 
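// For example, a phi merging three predicated incoming values is lowered to two chained vector selects, so its cost below is twice the vector select cost at this VF.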
7115 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7116 return (Phi->getNumIncomingValues() - 1) * 7117 TTI.getCmpSelInstrCost( 7118 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7119 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7120 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7121 7122 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7123 } 7124 case Instruction::UDiv: 7125 case Instruction::SDiv: 7126 case Instruction::URem: 7127 case Instruction::SRem: 7128 // If we have a predicated instruction, it may not be executed for each 7129 // vector lane. Get the scalarization cost and scale this amount by the 7130 // probability of executing the predicated block. If the instruction is not 7131 // predicated, we fall through to the next case. 7132 if (VF.isVector() && isScalarWithPredication(I)) { 7133 InstructionCost Cost = 0; 7134 7135 // These instructions have a non-void type, so account for the phi nodes 7136 // that we will create. This cost is likely to be zero. The phi node 7137 // cost, if any, should be scaled by the block probability because it 7138 // models a copy at the end of each predicated block. 7139 Cost += VF.getKnownMinValue() * 7140 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7141 7142 // The cost of the non-predicated instruction. 7143 Cost += VF.getKnownMinValue() * 7144 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7145 7146 // The cost of insertelement and extractelement instructions needed for 7147 // scalarization. 7148 Cost += getScalarizationOverhead(I, VF); 7149 7150 // Scale the cost by the probability of executing the predicated blocks. 7151 // This assumes the predicated block for each vector lane is equally 7152 // likely. 7153 return Cost / getReciprocalPredBlockProb(); 7154 } 7155 LLVM_FALLTHROUGH; 7156 case Instruction::Add: 7157 case Instruction::FAdd: 7158 case Instruction::Sub: 7159 case Instruction::FSub: 7160 case Instruction::Mul: 7161 case Instruction::FMul: 7162 case Instruction::FDiv: 7163 case Instruction::FRem: 7164 case Instruction::Shl: 7165 case Instruction::LShr: 7166 case Instruction::AShr: 7167 case Instruction::And: 7168 case Instruction::Or: 7169 case Instruction::Xor: { 7170 // Since we will replace the stride by 1 the multiplication should go away. 7171 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7172 return 0; 7173 // Certain instructions can be cheaper to vectorize if they have a constant 7174 // second vector operand. One example of this are shifts on x86. 7175 Value *Op2 = I->getOperand(1); 7176 TargetTransformInfo::OperandValueProperties Op2VP; 7177 TargetTransformInfo::OperandValueKind Op2VK = 7178 TTI.getOperandInfo(Op2, Op2VP); 7179 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7180 Op2VK = TargetTransformInfo::OK_UniformValue; 7181 7182 SmallVector<const Value *, 4> Operands(I->operand_values()); 7183 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7184 return N * TTI.getArithmeticInstrCost( 7185 I->getOpcode(), VectorTy, CostKind, 7186 TargetTransformInfo::OK_AnyValue, 7187 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7188 } 7189 case Instruction::FNeg: { 7190 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7191 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7192 return N * TTI.getArithmeticInstrCost( 7193 I->getOpcode(), VectorTy, CostKind, 7194 TargetTransformInfo::OK_AnyValue, 7195 TargetTransformInfo::OK_AnyValue, 7196 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7197 I->getOperand(0), I); 7198 } 7199 case Instruction::Select: { 7200 SelectInst *SI = cast<SelectInst>(I); 7201 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7202 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7203 Type *CondTy = SI->getCondition()->getType(); 7204 if (!ScalarCond) { 7205 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7206 CondTy = VectorType::get(CondTy, VF); 7207 } 7208 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7209 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7210 } 7211 case Instruction::ICmp: 7212 case Instruction::FCmp: { 7213 Type *ValTy = I->getOperand(0)->getType(); 7214 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7215 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7216 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7217 VectorTy = ToVectorTy(ValTy, VF); 7218 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7219 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7220 } 7221 case Instruction::Store: 7222 case Instruction::Load: { 7223 ElementCount Width = VF; 7224 if (Width.isVector()) { 7225 InstWidening Decision = getWideningDecision(I, Width); 7226 assert(Decision != CM_Unknown && 7227 "CM decision should be taken at this point"); 7228 if (Decision == CM_Scalarize) 7229 Width = ElementCount::getFixed(1); 7230 } 7231 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7232 return getMemoryInstructionCost(I, VF); 7233 } 7234 case Instruction::ZExt: 7235 case Instruction::SExt: 7236 case Instruction::FPToUI: 7237 case Instruction::FPToSI: 7238 case Instruction::FPExt: 7239 case Instruction::PtrToInt: 7240 case Instruction::IntToPtr: 7241 case Instruction::SIToFP: 7242 case Instruction::UIToFP: 7243 case Instruction::Trunc: 7244 case Instruction::FPTrunc: 7245 case Instruction::BitCast: { 7246 // Computes the CastContextHint from a Load/Store instruction. 7247 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7248 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7249 "Expected a load or a store!"); 7250 7251 if (VF.isScalar() || !TheLoop->contains(I)) 7252 return TTI::CastContextHint::Normal; 7253 7254 switch (getWideningDecision(I, VF)) { 7255 case LoopVectorizationCostModel::CM_GatherScatter: 7256 return TTI::CastContextHint::GatherScatter; 7257 case LoopVectorizationCostModel::CM_Interleave: 7258 return TTI::CastContextHint::Interleave; 7259 case LoopVectorizationCostModel::CM_Scalarize: 7260 case LoopVectorizationCostModel::CM_Widen: 7261 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7262 : TTI::CastContextHint::Normal; 7263 case LoopVectorizationCostModel::CM_Widen_Reverse: 7264 return TTI::CastContextHint::Reversed; 7265 case LoopVectorizationCostModel::CM_Unknown: 7266 llvm_unreachable("Instr did not go through cost modelling?"); 7267 } 7268 7269 llvm_unreachable("Unhandled case!"); 7270 }; 7271 7272 unsigned Opcode = I->getOpcode(); 7273 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7274 // For Trunc, the context is the only user, which must be a StoreInst. 
7275 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7276 if (I->hasOneUse()) 7277 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7278 CCH = ComputeCCH(Store); 7279 } 7280 // For Z/Sext, the context is the operand, which must be a LoadInst. 7281 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7282 Opcode == Instruction::FPExt) { 7283 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7284 CCH = ComputeCCH(Load); 7285 } 7286 7287 // We optimize the truncation of induction variables having constant 7288 // integer steps. The cost of these truncations is the same as the scalar 7289 // operation. 7290 if (isOptimizableIVTruncate(I, VF)) { 7291 auto *Trunc = cast<TruncInst>(I); 7292 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7293 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7294 } 7295 7296 Type *SrcScalarTy = I->getOperand(0)->getType(); 7297 Type *SrcVecTy = 7298 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7299 if (canTruncateToMinimalBitwidth(I, VF)) { 7300 // This cast is going to be shrunk. This may remove the cast or it might 7301 // turn it into slightly different cast. For example, if MinBW == 16, 7302 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7303 // 7304 // Calculate the modified src and dest types. 7305 Type *MinVecTy = VectorTy; 7306 if (Opcode == Instruction::Trunc) { 7307 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7308 VectorTy = 7309 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7310 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7311 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7312 VectorTy = 7313 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7314 } 7315 } 7316 7317 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7318 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7319 return N * 7320 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7321 } 7322 case Instruction::Call: { 7323 bool NeedToScalarize; 7324 CallInst *CI = cast<CallInst>(I); 7325 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7326 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7327 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7328 return std::min(CallCost, IntrinsicCost); 7329 } 7330 return CallCost; 7331 } 7332 case Instruction::ExtractValue: 7333 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7334 default: 7335 // The cost of executing VF copies of the scalar instruction. This opcode 7336 // is unknown. Assume that it is the same as 'mul'. 7337 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7338 Instruction::Mul, VectorTy, CostKind) + 7339 getScalarizationOverhead(I, VF); 7340 } // end of switch. 
7341 } 7342 7343 char LoopVectorize::ID = 0; 7344 7345 static const char lv_name[] = "Loop Vectorization"; 7346 7347 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7348 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7349 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7350 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7351 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7352 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7353 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7354 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7355 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7356 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7357 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7358 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7359 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7360 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7361 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7362 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7363 7364 namespace llvm { 7365 7366 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7367 7368 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7369 bool VectorizeOnlyWhenForced) { 7370 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7371 } 7372 7373 } // end namespace llvm 7374 7375 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7376 // Check if the pointer operand of a load or store instruction is 7377 // consecutive. 7378 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7379 return Legal->isConsecutivePtr(Ptr); 7380 return false; 7381 } 7382 7383 void LoopVectorizationCostModel::collectValuesToIgnore() { 7384 // Ignore ephemeral values. 7385 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7386 7387 // Ignore type-promoting instructions we identified during reduction 7388 // detection. 7389 for (auto &Reduction : Legal->getReductionVars()) { 7390 RecurrenceDescriptor &RedDes = Reduction.second; 7391 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7392 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7393 } 7394 // Ignore type-casting instructions we identified during induction 7395 // detection. 7396 for (auto &Induction : Legal->getInductionVars()) { 7397 InductionDescriptor &IndDes = Induction.second; 7398 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7399 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7400 } 7401 } 7402 7403 void LoopVectorizationCostModel::collectInLoopReductions() { 7404 for (auto &Reduction : Legal->getReductionVars()) { 7405 PHINode *Phi = Reduction.first; 7406 RecurrenceDescriptor &RdxDesc = Reduction.second; 7407 7408 // We don't collect reductions that are type promoted (yet). 7409 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7410 continue; 7411 7412 // If the target would prefer this reduction to happen "in-loop", then we 7413 // want to record it as such. 7414 unsigned Opcode = RdxDesc.getOpcode(); 7415 if (!PreferInLoopReductions && 7416 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7417 TargetTransformInfo::ReductionFlags())) 7418 continue; 7419 7420 // Check that we can correctly put the reductions into the loop, by 7421 // finding the chain of operations that leads from the phi to the loop 7422 // exit value. 
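// For a plain integer sum reduction, for instance, this chain is typically just the single 'add' between the phi and the value fed back on the latch; an empty chain means the reduction cannot be performed in-loop.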
7423 SmallVector<Instruction *, 4> ReductionOperations = 7424 RdxDesc.getReductionOpChain(Phi, TheLoop); 7425 bool InLoop = !ReductionOperations.empty(); 7426 if (InLoop) 7427 InLoopReductionChains[Phi] = ReductionOperations; 7428 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7429 << " reduction for phi: " << *Phi << "\n"); 7430 } 7431 } 7432 7433 // TODO: we could return a pair of values that specify the max VF and 7434 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7435 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment 7436 // doesn't have a cost model that can choose which plan to execute if 7437 // more than one is generated. 7438 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7439 LoopVectorizationCostModel &CM) { 7440 unsigned WidestType; 7441 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7442 return WidestVectorRegBits / WidestType; 7443 } 7444 7445 VectorizationFactor 7446 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7447 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7448 ElementCount VF = UserVF; 7449 // Outer loop handling: Outer loops may require CFG and instruction level 7450 // transformations before even evaluating whether vectorization is profitable. 7451 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7452 // the vectorization pipeline. 7453 if (!OrigLoop->isInnermost()) { 7454 // If the user doesn't provide a vectorization factor, determine a 7455 // reasonable one. 7456 if (UserVF.isZero()) { 7457 VF = ElementCount::getFixed( 7458 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 7459 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7460 7461 // Make sure we have a VF > 1 for stress testing. 7462 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7463 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7464 << "overriding computed VF.\n"); 7465 VF = ElementCount::getFixed(4); 7466 } 7467 } 7468 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7469 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7470 "VF needs to be a power of two"); 7471 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7472 << "VF " << VF << " to build VPlans.\n"); 7473 buildVPlans(VF, VF); 7474 7475 // For VPlan build stress testing, we bail out after VPlan construction. 7476 if (VPlanBuildStressTest) 7477 return VectorizationFactor::Disabled(); 7478 7479 return {VF, 0 /*Cost*/}; 7480 } 7481 7482 LLVM_DEBUG( 7483 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7484 "VPlan-native path.\n"); 7485 return VectorizationFactor::Disabled(); 7486 } 7487 7488 Optional<VectorizationFactor> 7489 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7490 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7491 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7492 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved. 7493 return None; 7494 7495 // Invalidate interleave groups if all blocks of the loop will be predicated.
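// If even the header needs predication, the tail is being folded by masking, so every interleaved access would have to be masked; without target support for masked interleaved accesses the groups must be dropped.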
7496 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7497 !useMaskedInterleavedAccesses(*TTI)) { 7498 LLVM_DEBUG( 7499 dbgs() 7500 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7501 "which requires masked-interleaved support.\n"); 7502 if (CM.InterleaveInfo.invalidateGroups()) 7503 // Invalidating interleave groups also requires invalidating all decisions 7504 // based on them, which includes widening decisions and uniform and scalar 7505 // values. 7506 CM.invalidateCostModelingDecisions(); 7507 } 7508 7509 ElementCount MaxVF = MaybeMaxVF.getValue(); 7510 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7511 7512 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7513 if (!UserVF.isZero() && 7514 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7515 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7516 // VFs here, this should be reverted to only use legal UserVFs once the 7517 // loop below supports scalable VFs. 7518 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7519 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7520 << " VF " << VF << ".\n"); 7521 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7522 "VF needs to be a power of two"); 7523 // Collect the instructions (and their associated costs) that will be more 7524 // profitable to scalarize. 7525 CM.selectUserVectorizationFactor(VF); 7526 CM.collectInLoopReductions(); 7527 buildVPlansWithVPRecipes(VF, VF); 7528 LLVM_DEBUG(printPlans(dbgs())); 7529 return {{VF, 0}}; 7530 } 7531 7532 assert(!MaxVF.isScalable() && 7533 "Scalable vectors not yet supported beyond this point"); 7534 7535 for (ElementCount VF = ElementCount::getFixed(1); 7536 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7537 // Collect Uniform and Scalar instructions after vectorization with VF. 7538 CM.collectUniformsAndScalars(VF); 7539 7540 // Collect the instructions (and their associated costs) that will be more 7541 // profitable to scalarize. 7542 if (VF.isVector()) 7543 CM.collectInstsToScalarize(VF); 7544 } 7545 7546 CM.collectInLoopReductions(); 7547 7548 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7549 LLVM_DEBUG(printPlans(dbgs())); 7550 if (MaxVF.isScalar()) 7551 return VectorizationFactor::Disabled(); 7552 7553 // Select the optimal vectorization factor. 7554 return CM.selectVectorizationFactor(MaxVF); 7555 } 7556 7557 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7558 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7559 << '\n'); 7560 BestVF = VF; 7561 BestUF = UF; 7562 7563 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7564 return !Plan->hasVF(VF); 7565 }); 7566 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7567 } 7568 7569 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7570 DominatorTree *DT) { 7571 // Perform the actual loop transformation. 7572 7573 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
7574 VPCallbackILV CallbackILV(ILV); 7575 7576 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7577 7578 VPTransformState State{*BestVF, BestUF, LI, 7579 DT, ILV.Builder, ILV.VectorLoopValueMap, 7580 &ILV, CallbackILV}; 7581 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7582 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7583 State.CanonicalIV = ILV.Induction; 7584 7585 ILV.printDebugTracesAtStart(); 7586 7587 //===------------------------------------------------===// 7588 // 7589 // Notice: any optimization or new instruction that goes 7590 // into the code below should also be implemented in 7591 // the cost model. 7592 // 7593 //===------------------------------------------------===// 7594 7595 // 2. Copy and widen instructions from the old loop into the new loop. 7596 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7597 VPlans.front()->execute(&State); 7598 7599 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7600 // predication, updating analyses. 7601 ILV.fixVectorizedLoop(); 7602 7603 ILV.printDebugTracesAtEnd(); 7604 } 7605 7606 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7607 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7608 7609 // We create new control-flow for the vectorized loop, so the original exit 7610 // conditions will be dead after vectorization if they are only used by the 7611 // terminator. 7612 SmallVector<BasicBlock*> ExitingBlocks; 7613 OrigLoop->getExitingBlocks(ExitingBlocks); 7614 for (auto *BB : ExitingBlocks) { 7615 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7616 if (!Cmp || !Cmp->hasOneUse()) 7617 continue; 7618 7619 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7620 if (!DeadInstructions.insert(Cmp).second) 7621 continue; 7622 7623 // An operand of the icmp is often a dead trunc, used by IndUpdate. 7624 // TODO: can recurse through operands in general 7625 for (Value *Op : Cmp->operands()) { 7626 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7627 DeadInstructions.insert(cast<Instruction>(Op)); 7628 } 7629 } 7630 7631 // We create new "steps" for induction variable updates to which the original 7632 // induction variables map. An original update instruction will be dead if 7633 // all its users except the induction variable are dead. 7634 auto *Latch = OrigLoop->getLoopLatch(); 7635 for (auto &Induction : Legal->getInductionVars()) { 7636 PHINode *Ind = Induction.first; 7637 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7638 7639 // If the tail is to be folded by masking, the primary induction variable, 7640 // if it exists, isn't dead: it will be used for masking. Don't kill it. 7641 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7642 continue; 7643 7644 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7645 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7646 })) 7647 DeadInstructions.insert(IndUpdate); 7648 7649 // We also record as "Dead" the type-casting instructions we had identified 7650 // during induction analysis. We don't need any handling for them in the 7651 // vectorized loop because we have proven that, under a proper runtime 7652 // test guarding the vectorized loop, the value of the phi, and the casted 7653 // value of the phi, are the same. The last instruction in this casting chain 7654 // will get its scalar/vector/widened def from the scalar/vector/widened def 7655 // of the respective phi node.
Any other casts in the induction def-use chain 7656 // have no other uses outside the phi update chain, and will be ignored. 7657 InductionDescriptor &IndDes = Induction.second; 7658 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7659 DeadInstructions.insert(Casts.begin(), Casts.end()); 7660 } 7661 } 7662 7663 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7664 7665 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7666 7667 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7668 Instruction::BinaryOps BinOp) { 7669 // When unrolling and the VF is 1, we only need to add a simple scalar. 7670 Type *Ty = Val->getType(); 7671 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7672 7673 if (Ty->isFloatingPointTy()) { 7674 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7675 7676 // Floating point operations had to be 'fast' to enable the unrolling. 7677 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7678 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7679 } 7680 Constant *C = ConstantInt::get(Ty, StartIdx); 7681 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7682 } 7683 7684 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7685 SmallVector<Metadata *, 4> MDs; 7686 // Reserve first location for self reference to the LoopID metadata node. 7687 MDs.push_back(nullptr); 7688 bool IsUnrollMetadata = false; 7689 MDNode *LoopID = L->getLoopID(); 7690 if (LoopID) { 7691 // First find existing loop unrolling disable metadata. 7692 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7693 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7694 if (MD) { 7695 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7696 IsUnrollMetadata = 7697 S && S->getString().startswith("llvm.loop.unroll.disable"); 7698 } 7699 MDs.push_back(LoopID->getOperand(i)); 7700 } 7701 } 7702 7703 if (!IsUnrollMetadata) { 7704 // Add runtime unroll disable metadata. 7705 LLVMContext &Context = L->getHeader()->getContext(); 7706 SmallVector<Metadata *, 1> DisableOperands; 7707 DisableOperands.push_back( 7708 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7709 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7710 MDs.push_back(DisableNode); 7711 MDNode *NewLoopID = MDNode::get(Context, MDs); 7712 // Set operand 0 to refer to the loop id itself. 7713 NewLoopID->replaceOperandWith(0, NewLoopID); 7714 L->setLoopID(NewLoopID); 7715 } 7716 } 7717 7718 //===--------------------------------------------------------------------===// 7719 // EpilogueVectorizerMainLoop 7720 //===--------------------------------------------------------------------===// 7721 7722 /// This function is partially responsible for generating the control flow 7723 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7724 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7725 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7726 Loop *Lp = createVectorLoopSkeleton(""); 7727 7728 // Generate the code to check the minimum iteration count of the vector 7729 // epilogue (see below). 7730 EPI.EpilogueIterationCountCheck = 7731 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7732 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7733 7734 // Generate the code to check any assumptions that we've made for SCEV 7735 // expressions. 
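// emitSCEVChecks updates LoopVectorPreHeader when it actually emits a check; the comparison just below relies on that to detect and record the check block.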
7736 BasicBlock *SavedPreHeader = LoopVectorPreHeader; 7737 emitSCEVChecks(Lp, LoopScalarPreHeader); 7738 7739 // If a safety check was generated, save it. 7740 if (SavedPreHeader != LoopVectorPreHeader) 7741 EPI.SCEVSafetyCheck = SavedPreHeader; 7742 7743 // Generate the code that checks at runtime if arrays overlap. We put the 7744 // checks into a separate block to make the more common case of few elements 7745 // faster. 7746 SavedPreHeader = LoopVectorPreHeader; 7747 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 7748 7749 // If a safety check was generated, save/overwrite it. 7750 if (SavedPreHeader != LoopVectorPreHeader) 7751 EPI.MemSafetyCheck = SavedPreHeader; 7752 7753 // Generate the iteration count check for the main loop, *after* the check 7754 // for the epilogue loop, so that the path-length is shorter for the case 7755 // that goes directly through the vector epilogue. The longer-path length for 7756 // the main loop is compensated for by the gain from vectorizing the larger 7757 // trip count. Note: the branch will get updated later on when we vectorize 7758 // the epilogue. 7759 EPI.MainLoopIterationCountCheck = 7760 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7761 7762 // Generate the induction variable. 7763 OldInduction = Legal->getPrimaryInduction(); 7764 Type *IdxTy = Legal->getWidestInductionType(); 7765 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7766 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7767 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7768 EPI.VectorTripCount = CountRoundDown; 7769 Induction = 7770 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7771 getDebugLocFromInstOrOperands(OldInduction)); 7772 7773 // Skip induction resume value creation here because they will be created in 7774 // the second pass. If we created them here, they wouldn't be used anyway, 7775 // because the vplan in the second pass still contains the inductions from the 7776 // original loop. 7777 7778 return completeLoopSkeleton(Lp, OrigLoopID); 7779 } 7780 7781 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7782 LLVM_DEBUG({ 7783 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7784 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7785 << ", Main Loop UF:" << EPI.MainLoopUF 7786 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7787 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7788 }); 7789 } 7790 7791 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7792 DEBUG_WITH_TYPE(VerboseDebug, { 7793 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 7794 }); 7795 } 7796 7797 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 7798 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 7799 assert(L && "Expected valid Loop."); 7800 assert(Bypass && "Expected valid bypass basic block."); 7801 unsigned VFactor = 7802 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 7803 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7804 Value *Count = getOrCreateTripCount(L); 7805 // Reuse existing vector loop preheader for TC checks. 7806 // Note that a new preheader block is generated for the vector loop. 7807 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7808 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7809 7810 // Generate code to check if the loop's trip count is less than VF * UF of the 7811 // main vector loop. 7812 auto P = 7813 Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7814 7815 Value *CheckMinIters = Builder.CreateICmp( 7816 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 7817 "min.iters.check"); 7818 7819 if (!ForEpilogue) 7820 TCCheckBlock->setName("vector.main.loop.iter.check"); 7821 7822 // Create new preheader for vector loop. 7823 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7824 DT, LI, nullptr, "vector.ph"); 7825 7826 if (ForEpilogue) { 7827 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7828 DT->getNode(Bypass)->getIDom()) && 7829 "TC check is expected to dominate Bypass"); 7830 7831 // Update dominator for Bypass & LoopExit. 7832 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7833 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7834 7835 LoopBypassBlocks.push_back(TCCheckBlock); 7836 7837 // Save the trip count so we don't have to regenerate it in the 7838 // vec.epilog.iter.check. This is safe to do because the trip count 7839 // generated here dominates the vector epilog iter check. 7840 EPI.TripCount = Count; 7841 } 7842 7843 ReplaceInstWithInst( 7844 TCCheckBlock->getTerminator(), 7845 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7846 7847 return TCCheckBlock; 7848 } 7849 7850 //===--------------------------------------------------------------------===// 7851 // EpilogueVectorizerEpilogueLoop 7852 //===--------------------------------------------------------------------===// 7853 7854 /// This function is partially responsible for generating the control flow 7855 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7856 BasicBlock * 7857 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7858 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7859 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 7860 7861 // Now, compare the remaining count and if there aren't enough iterations to 7862 // execute the vectorized epilogue skip to the scalar part. 7863 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7864 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7865 LoopVectorPreHeader = 7866 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7867 LI, nullptr, "vec.epilog.ph"); 7868 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 7869 VecEpilogueIterationCountCheck); 7870 7871 // Adjust the control flow taking the state info from the main loop 7872 // vectorization into account. 
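// The re-wiring below redirects edges that previously targeted this block: the main-loop iteration count check now branches to the new epilogue preheader, while the other saved checks bypass straight to the scalar preheader.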
7873 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7874 "expected this to be saved from the previous pass."); 7875 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7876 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7877 7878 DT->changeImmediateDominator(LoopVectorPreHeader, 7879 EPI.MainLoopIterationCountCheck); 7880 7881 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7882 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7883 7884 if (EPI.SCEVSafetyCheck) 7885 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7886 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7887 if (EPI.MemSafetyCheck) 7888 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7889 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7890 7891 DT->changeImmediateDominator( 7892 VecEpilogueIterationCountCheck, 7893 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7894 7895 DT->changeImmediateDominator(LoopScalarPreHeader, 7896 EPI.EpilogueIterationCountCheck); 7897 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 7898 7899 // Keep track of bypass blocks, as they feed start values to the induction 7900 // phis in the scalar loop preheader. 7901 if (EPI.SCEVSafetyCheck) 7902 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7903 if (EPI.MemSafetyCheck) 7904 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7905 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7906 7907 // Generate a resume induction for the vector epilogue and put it in the 7908 // vector epilogue preheader 7909 Type *IdxTy = Legal->getWidestInductionType(); 7910 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 7911 LoopVectorPreHeader->getFirstNonPHI()); 7912 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 7913 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 7914 EPI.MainLoopIterationCountCheck); 7915 7916 // Generate the induction variable. 7917 OldInduction = Legal->getPrimaryInduction(); 7918 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7919 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7920 Value *StartIdx = EPResumeVal; 7921 Induction = 7922 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7923 getDebugLocFromInstOrOperands(OldInduction)); 7924 7925 // Generate induction resume values. These variables save the new starting 7926 // indexes for the scalar loop. They are used to test if there are any tail 7927 // iterations left once the vector loop has completed. 7928 // Note that when the vectorized epilogue is skipped due to iteration count 7929 // check, then the resume value for the induction variable comes from 7930 // the trip count of the main vector loop, hence passing the AdditionalBypass 7931 // argument. 
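// The AdditionalBypass pair below names that bypass block together with the main vector loop's trip count, which becomes the resume value along that path.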
7932 createInductionResumeValues(Lp, CountRoundDown, 7933 {VecEpilogueIterationCountCheck, 7934 EPI.VectorTripCount} /* AdditionalBypass */); 7935 7936 AddRuntimeUnrollDisableMetaData(Lp); 7937 return completeLoopSkeleton(Lp, OrigLoopID); 7938 } 7939 7940 BasicBlock * 7941 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 7942 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 7943 7944 assert(EPI.TripCount && 7945 "Expected trip count to have been saved in the first pass."); 7946 assert( 7947 (!isa<Instruction>(EPI.TripCount) || 7948 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 7949 "saved trip count does not dominate insertion point."); 7950 Value *TC = EPI.TripCount; 7951 IRBuilder<> Builder(Insert->getTerminator()); 7952 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 7953 7954 // Generate code to check if the loop's trip count is less than VF * UF of the 7955 // vector epilogue loop. 7956 auto P = 7957 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7958 7959 Value *CheckMinIters = Builder.CreateICmp( 7960 P, Count, 7961 ConstantInt::get(Count->getType(), 7962 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 7963 "min.epilog.iters.check"); 7964 7965 ReplaceInstWithInst( 7966 Insert->getTerminator(), 7967 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7968 7969 LoopBypassBlocks.push_back(Insert); 7970 return Insert; 7971 } 7972 7973 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7974 LLVM_DEBUG({ 7975 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7976 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7977 << ", Main Loop UF:" << EPI.MainLoopUF 7978 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7979 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7980 }); 7981 } 7982 7983 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7984 DEBUG_WITH_TYPE(VerboseDebug, { 7985 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 7986 }); 7987 } 7988 7989 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7990 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7991 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7992 bool PredicateAtRangeStart = Predicate(Range.Start); 7993 7994 for (ElementCount TmpVF = Range.Start * 2; 7995 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 7996 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7997 Range.End = TmpVF; 7998 break; 7999 } 8000 8001 return PredicateAtRangeStart; 8002 } 8003 8004 /// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF, 8005 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8006 /// of VFs starting at a given VF and extending it as much as possible. Each 8007 /// vectorization decision can potentially shorten this sub-range during 8008 /// buildVPlan(). 8009 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8010 ElementCount MaxVF) { 8011 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8012 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8013 VFRange SubRange = {VF, MaxVFPlusOne}; 8014 VPlans.push_back(buildVPlan(SubRange)); 8015 VF = SubRange.End; 8016 } 8017 } 8018 8019 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8020 VPlanPtr &Plan) { 8021 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8022 8023 // Look for cached value.
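// Edge masks are memoized per (Src, Dst) pair so that repeated queries for the same edge reuse the previously built mask instead of recomputing it.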
8024 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8025 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8026 if (ECEntryIt != EdgeMaskCache.end()) 8027 return ECEntryIt->second; 8028 8029 VPValue *SrcMask = createBlockInMask(Src, Plan); 8030 8031 // The terminator has to be a branch inst! 8032 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8033 assert(BI && "Unexpected terminator found"); 8034 8035 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8036 return EdgeMaskCache[Edge] = SrcMask; 8037 8038 // If source is an exiting block, we know the exit edge is dynamically dead 8039 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8040 // adding uses of an otherwise potentially dead instruction. 8041 if (OrigLoop->isLoopExiting(Src)) 8042 return EdgeMaskCache[Edge] = SrcMask; 8043 8044 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8045 assert(EdgeMask && "No Edge Mask found for condition"); 8046 8047 if (BI->getSuccessor(0) != Dst) 8048 EdgeMask = Builder.createNot(EdgeMask); 8049 8050 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 8051 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 8052 8053 return EdgeMaskCache[Edge] = EdgeMask; 8054 } 8055 8056 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8057 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8058 8059 // Look for cached value. 8060 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8061 if (BCEntryIt != BlockMaskCache.end()) 8062 return BCEntryIt->second; 8063 8064 // All-one mask is modelled as no-mask following the convention for masked 8065 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8066 VPValue *BlockMask = nullptr; 8067 8068 if (OrigLoop->getHeader() == BB) { 8069 if (!CM.blockNeedsPredication(BB)) 8070 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8071 8072 // Create the block in mask as the first non-phi instruction in the block. 8073 VPBuilder::InsertPointGuard Guard(Builder); 8074 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8075 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8076 8077 // Introduce the early-exit compare IV <= BTC to form header block mask. 8078 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8079 // Start by constructing the desired canonical IV. 8080 VPValue *IV = nullptr; 8081 if (Legal->getPrimaryInduction()) 8082 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8083 else { 8084 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8085 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8086 IV = IVRecipe->getVPValue(); 8087 } 8088 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8089 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8090 8091 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8092 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8093 // as a second argument, we only pass the IV here and extract the 8094 // tripcount from the transform state where codegen of the VP instructions 8095 // happen. 8096 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8097 } else { 8098 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8099 } 8100 return BlockMaskCache[BB] = BlockMask; 8101 } 8102 8103 // This is the block mask. We OR all incoming edges. 
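// A nullptr edge mask means that edge is taken under an all-one mask, so the whole block mask degenerates to all-one (nullptr) and the loop below can stop early.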
8104 for (auto *Predecessor : predecessors(BB)) { 8105 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8106 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8107 return BlockMaskCache[BB] = EdgeMask; 8108 8109 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8110 BlockMask = EdgeMask; 8111 continue; 8112 } 8113 8114 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8115 } 8116 8117 return BlockMaskCache[BB] = BlockMask; 8118 } 8119 8120 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8121 VPlanPtr &Plan) { 8122 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8123 "Must be called with either a load or store"); 8124 8125 auto willWiden = [&](ElementCount VF) -> bool { 8126 if (VF.isScalar()) 8127 return false; 8128 LoopVectorizationCostModel::InstWidening Decision = 8129 CM.getWideningDecision(I, VF); 8130 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8131 "CM decision should be taken at this point."); 8132 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8133 return true; 8134 if (CM.isScalarAfterVectorization(I, VF) || 8135 CM.isProfitableToScalarize(I, VF)) 8136 return false; 8137 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8138 }; 8139 8140 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8141 return nullptr; 8142 8143 VPValue *Mask = nullptr; 8144 if (Legal->isMaskRequired(I)) 8145 Mask = createBlockInMask(I->getParent(), Plan); 8146 8147 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8148 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8149 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8150 8151 StoreInst *Store = cast<StoreInst>(I); 8152 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8153 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8154 } 8155 8156 VPWidenIntOrFpInductionRecipe * 8157 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { 8158 // Check if this is an integer or fp induction. If so, build the recipe that 8159 // produces its scalar and vector values. 8160 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8161 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8162 II.getKind() == InductionDescriptor::IK_FpInduction) { 8163 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8164 return new VPWidenIntOrFpInductionRecipe(Phi, Start); 8165 } 8166 8167 return nullptr; 8168 } 8169 8170 VPWidenIntOrFpInductionRecipe * 8171 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, 8172 VPlan &Plan) const { 8173 // Optimize the special case where the source is a constant integer 8174 // induction variable. Notice that we can only optimize the 'trunc' case 8175 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8176 // (c) other casts depend on pointer size. 8177 8178 // Determine whether \p K is a truncation based on an induction variable that 8179 // can be optimized. 
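// The answer depends on the VF, so the check is wrapped in a predicate that getDecisionAndClampRange can evaluate (and clamp the range on) below.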
8180 auto isOptimizableIVTruncate = 8181 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8182 return [=](ElementCount VF) -> bool { 8183 return CM.isOptimizableIVTruncate(K, VF); 8184 }; 8185 }; 8186 8187 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8188 isOptimizableIVTruncate(I), Range)) { 8189 8190 InductionDescriptor II = 8191 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8192 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8193 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8194 Start, I); 8195 } 8196 return nullptr; 8197 } 8198 8199 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8200 // We know that all PHIs in non-header blocks are converted into selects, so 8201 // we don't have to worry about the insertion order and we can just use the 8202 // builder. At this point we generate the predication tree. There may be 8203 // duplications since this is a simple recursive scan, but future 8204 // optimizations will clean it up. 8205 8206 SmallVector<VPValue *, 2> Operands; 8207 unsigned NumIncoming = Phi->getNumIncomingValues(); 8208 for (unsigned In = 0; In < NumIncoming; In++) { 8209 VPValue *EdgeMask = 8210 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8211 assert((EdgeMask || NumIncoming == 1) && 8212 "Multiple predecessors with one having a full mask"); 8213 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8214 if (EdgeMask) 8215 Operands.push_back(EdgeMask); 8216 } 8217 return new VPBlendRecipe(Phi, Operands); 8218 } 8219 8220 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8221 VPlan &Plan) const { 8222 8223 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8224 [this, CI](ElementCount VF) { 8225 return CM.isScalarWithPredication(CI, VF); 8226 }, 8227 Range); 8228 8229 if (IsPredicated) 8230 return nullptr; 8231 8232 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8233 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8234 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8235 ID == Intrinsic::pseudoprobe || 8236 ID == Intrinsic::experimental_noalias_scope_decl)) 8237 return nullptr; 8238 8239 auto willWiden = [&](ElementCount VF) -> bool { 8240 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8241 // The following case may be scalarized depending on the VF. 8242 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8243 // version of the instruction. 8244 // Is it beneficial to perform intrinsic call compared to lib call? 8245 bool NeedToScalarize = false; 8246 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8247 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8248 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8249 assert(IntrinsicCost.isValid() && CallCost.isValid() && 8250 "Cannot have invalid costs while widening"); 8251 return UseVectorIntrinsic || !NeedToScalarize; 8252 }; 8253 8254 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8255 return nullptr; 8256 8257 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8258 } 8259 8260 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8261 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8262 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8263 // Instruction should be widened, unless it is scalar after vectorization, 8264 // scalarization is profitable or it is predicated. 8265 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8266 return CM.isScalarAfterVectorization(I, VF) || 8267 CM.isProfitableToScalarize(I, VF) || 8268 CM.isScalarWithPredication(I, VF); 8269 }; 8270 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8271 Range); 8272 } 8273 8274 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8275 auto IsVectorizableOpcode = [](unsigned Opcode) { 8276 switch (Opcode) { 8277 case Instruction::Add: 8278 case Instruction::And: 8279 case Instruction::AShr: 8280 case Instruction::BitCast: 8281 case Instruction::FAdd: 8282 case Instruction::FCmp: 8283 case Instruction::FDiv: 8284 case Instruction::FMul: 8285 case Instruction::FNeg: 8286 case Instruction::FPExt: 8287 case Instruction::FPToSI: 8288 case Instruction::FPToUI: 8289 case Instruction::FPTrunc: 8290 case Instruction::FRem: 8291 case Instruction::FSub: 8292 case Instruction::ICmp: 8293 case Instruction::IntToPtr: 8294 case Instruction::LShr: 8295 case Instruction::Mul: 8296 case Instruction::Or: 8297 case Instruction::PtrToInt: 8298 case Instruction::SDiv: 8299 case Instruction::Select: 8300 case Instruction::SExt: 8301 case Instruction::Shl: 8302 case Instruction::SIToFP: 8303 case Instruction::SRem: 8304 case Instruction::Sub: 8305 case Instruction::Trunc: 8306 case Instruction::UDiv: 8307 case Instruction::UIToFP: 8308 case Instruction::URem: 8309 case Instruction::Xor: 8310 case Instruction::ZExt: 8311 return true; 8312 } 8313 return false; 8314 }; 8315 8316 if (!IsVectorizableOpcode(I->getOpcode())) 8317 return nullptr; 8318 8319 // Success: widen this instruction. 8320 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8321 } 8322 8323 VPBasicBlock *VPRecipeBuilder::handleReplication( 8324 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8325 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 8326 VPlanPtr &Plan) { 8327 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8328 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8329 Range); 8330 8331 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8332 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8333 Range); 8334 8335 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8336 IsUniform, IsPredicated); 8337 setRecipe(I, Recipe); 8338 Plan->addVPValue(I, Recipe); 8339 8340 // Find if I uses a predicated instruction. If so, it will use its scalar 8341 // value. Avoid hoisting the insert-element which packs the scalar value into 8342 // a vector value, as that happens iff all users use the vector value. 
8343 for (auto &Op : I->operands()) 8344 if (auto *PredInst = dyn_cast<Instruction>(Op)) 8345 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 8346 PredInst2Recipe[PredInst]->setAlsoPack(false); 8347 8348 // Finalize the recipe for Instr, first if it is not predicated. 8349 if (!IsPredicated) { 8350 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8351 VPBB->appendRecipe(Recipe); 8352 return VPBB; 8353 } 8354 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8355 assert(VPBB->getSuccessors().empty() && 8356 "VPBB has successors when handling predicated replication."); 8357 // Record predicated instructions for above packing optimizations. 8358 PredInst2Recipe[I] = Recipe; 8359 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8360 VPBlockUtils::insertBlockAfter(Region, VPBB); 8361 auto *RegSucc = new VPBasicBlock(); 8362 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8363 return RegSucc; 8364 } 8365 8366 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8367 VPRecipeBase *PredRecipe, 8368 VPlanPtr &Plan) { 8369 // Instructions marked for predication are replicated and placed under an 8370 // if-then construct to prevent side-effects. 8371 8372 // Generate recipes to compute the block mask for this region. 8373 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8374 8375 // Build the triangular if-then region. 8376 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8377 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8378 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8379 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8380 auto *PHIRecipe = Instr->getType()->isVoidTy() 8381 ? nullptr 8382 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8383 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8384 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8385 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8386 8387 // Note: first set Entry as region entry and then connect successors starting 8388 // from it in order, to propagate the "parent" of each VPBasicBlock. 8389 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8390 VPBlockUtils::connectBlocks(Pred, Exit); 8391 8392 return Region; 8393 } 8394 8395 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8396 VFRange &Range, 8397 VPlanPtr &Plan) { 8398 // First, check for specific widening recipes that deal with calls, memory 8399 // operations, inductions and Phi nodes. 
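  // The order below matters: calls and memory operations are tried first, then
  // phis (blends for non-header phis; induction, reduction or generic widened
  // phis for header phis), then truncates of inductions, and only then the
  // generic widening of GEPs, selects and other vectorizable opcodes.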
  if (auto *CI = dyn_cast<CallInst>(Instr))
    return tryToWidenCall(CI, Range, *Plan);

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return tryToWidenMemory(Instr, Range, Plan);

  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Plan);
    if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
      return Recipe;

    if (Legal->isReductionVariable(Phi)) {
      RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
      VPValue *StartV =
          Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
      return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV);
    }

    return new VPWidenPHIRecipe(Phi);
  }

  if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
                                    cast<TruncInst>(Instr), Range, *Plan)))
    return Recipe;

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
                                OrigLoop);

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
                                   InvariantCond);
  }

  return tryToWiden(Instr, *Plan);
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  }

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
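  // This block only serves as an insertion anchor while recipes are created;
  // it is disconnected and deleted again once the initial VPlan is complete.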
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
        continue;

      if (auto Recipe =
              RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
        for (auto *Def : Recipe->definedValues()) {
          auto *UV = Def->getUnderlyingValue();
          Plan->addVPValue(UV, Def);
        }

        RecipeBuilder.setRecipe(Instr, Recipe);
        VPBB->appendRecipe(Recipe);
        continue;
      }

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
          Instr, Range, VPBB, PredInst2Recipe, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one (VPBB), reflecting original
  // basic-blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Apply Sink-After legal constraints.
  for (auto &Entry : SinkAfter) {
    VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
    VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
    // If the target is in a replication region, make sure to move Sink to the
    // block after it, not into the replication region itself.
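    // For example (illustrative), if Target sits inside a "pred.*" region
    // created by createReplicateRegion, Sink is placed at the beginning of the
    // region's single successor block instead of next to Target.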
8613 if (auto *Region = 8614 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { 8615 if (Region->isReplicator()) { 8616 assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); 8617 VPBasicBlock *NextBlock = 8618 cast<VPBasicBlock>(Region->getSuccessors().front()); 8619 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8620 continue; 8621 } 8622 } 8623 Sink->moveAfter(Target); 8624 } 8625 8626 // Interleave memory: for each Interleave Group we marked earlier as relevant 8627 // for this VPlan, replace the Recipes widening its memory instructions with a 8628 // single VPInterleaveRecipe at its insertion point. 8629 for (auto IG : InterleaveGroups) { 8630 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8631 RecipeBuilder.getRecipe(IG->getInsertPos())); 8632 SmallVector<VPValue *, 4> StoredValues; 8633 for (unsigned i = 0; i < IG->getFactor(); ++i) 8634 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8635 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8636 8637 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8638 Recipe->getMask()); 8639 VPIG->insertBefore(Recipe); 8640 unsigned J = 0; 8641 for (unsigned i = 0; i < IG->getFactor(); ++i) 8642 if (Instruction *Member = IG->getMember(i)) { 8643 if (!Member->getType()->isVoidTy()) { 8644 VPValue *OriginalV = Plan->getVPValue(Member); 8645 Plan->removeVPValueFor(Member); 8646 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8647 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8648 J++; 8649 } 8650 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8651 } 8652 } 8653 8654 // Adjust the recipes for any inloop reductions. 8655 if (Range.Start.isVector()) 8656 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8657 8658 // Finally, if tail is folded by masking, introduce selects between the phi 8659 // and the live-out instruction of each reduction, at the end of the latch. 8660 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8661 Builder.setInsertPoint(VPBB); 8662 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8663 for (auto &Reduction : Legal->getReductionVars()) { 8664 if (CM.isInLoopReduction(Reduction.first)) 8665 continue; 8666 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8667 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8668 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8669 } 8670 } 8671 8672 std::string PlanName; 8673 raw_string_ostream RSO(PlanName); 8674 ElementCount VF = Range.Start; 8675 Plan->addVF(VF); 8676 RSO << "Initial VPlan for VF={" << VF; 8677 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8678 Plan->addVF(VF); 8679 RSO << "," << VF; 8680 } 8681 RSO << "},UF>=1"; 8682 RSO.flush(); 8683 Plan->setName(PlanName); 8684 8685 return Plan; 8686 } 8687 8688 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8689 // Outer loop handling: They may require CFG and instruction level 8690 // transformations before even evaluating whether vectorization is profitable. 8691 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8692 // the vectorization pipeline. 
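  // This path is only exercised when EnableVPlanNativePath is set (asserted
  // below); innermost loops are planned via buildVPlanWithVPRecipes above.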
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
       VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(
      OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
  return Plan;
}

// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert(isa<VPWidenRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      auto *CondOp = CM.foldTailByMasking()
                         ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 8759 : nullptr; 8760 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 8761 &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); 8762 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8763 Plan->removeVPValueFor(R); 8764 Plan->addVPValue(R, RedRecipe); 8765 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 8766 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8767 WidenRecipe->eraseFromParent(); 8768 8769 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8770 VPRecipeBase *CompareRecipe = 8771 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 8772 assert(isa<VPWidenRecipe>(CompareRecipe) && 8773 "Expected to replace a VPWidenSC"); 8774 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 8775 "Expected no remaining users"); 8776 CompareRecipe->eraseFromParent(); 8777 } 8778 Chain = R; 8779 } 8780 } 8781 } 8782 8783 Value* LoopVectorizationPlanner::VPCallbackILV:: 8784 getOrCreateVectorValues(Value *V, unsigned Part) { 8785 return ILV.getOrCreateVectorValue(V, Part); 8786 } 8787 8788 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 8789 Value *V, const VPIteration &Instance) { 8790 return ILV.getOrCreateScalarValue(V, Instance); 8791 } 8792 8793 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 8794 VPSlotTracker &SlotTracker) const { 8795 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 8796 IG->getInsertPos()->printAsOperand(O, false); 8797 O << ", "; 8798 getAddr()->printAsOperand(O, SlotTracker); 8799 VPValue *Mask = getMask(); 8800 if (Mask) { 8801 O << ", "; 8802 Mask->printAsOperand(O, SlotTracker); 8803 } 8804 for (unsigned i = 0; i < IG->getFactor(); ++i) 8805 if (Instruction *I = IG->getMember(i)) 8806 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 8807 } 8808 8809 void VPWidenCallRecipe::execute(VPTransformState &State) { 8810 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 8811 *this, State); 8812 } 8813 8814 void VPWidenSelectRecipe::execute(VPTransformState &State) { 8815 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 8816 this, *this, InvariantCond, State); 8817 } 8818 8819 void VPWidenRecipe::execute(VPTransformState &State) { 8820 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 8821 } 8822 8823 void VPWidenGEPRecipe::execute(VPTransformState &State) { 8824 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 8825 *this, State.UF, State.VF, IsPtrLoopInvariant, 8826 IsIndexLoopInvariant, State); 8827 } 8828 8829 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 8830 assert(!State.Instance && "Int or FP induction being replicated."); 8831 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 8832 Trunc); 8833 } 8834 8835 void VPWidenPHIRecipe::execute(VPTransformState &State) { 8836 Value *StartV = 8837 getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr; 8838 State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF); 8839 } 8840 8841 void VPBlendRecipe::execute(VPTransformState &State) { 8842 State.ILV->setDebugLocFromInst(State.Builder, Phi); 8843 // We know that all PHIs in non-header blocks are converted into 8844 // selects, so we don't have to worry about the insertion order and we 8845 // can just use the builder. 8846 // At this point we generate the predication tree. 
There may be 8847 // duplications since this is a simple recursive scan, but future 8848 // optimizations will clean it up. 8849 8850 unsigned NumIncoming = getNumIncomingValues(); 8851 8852 // Generate a sequence of selects of the form: 8853 // SELECT(Mask3, In3, 8854 // SELECT(Mask2, In2, 8855 // SELECT(Mask1, In1, 8856 // In0))) 8857 // Note that Mask0 is never used: lanes for which no path reaches this phi and 8858 // are essentially undef are taken from In0. 8859 InnerLoopVectorizer::VectorParts Entry(State.UF); 8860 for (unsigned In = 0; In < NumIncoming; ++In) { 8861 for (unsigned Part = 0; Part < State.UF; ++Part) { 8862 // We might have single edge PHIs (blocks) - use an identity 8863 // 'select' for the first PHI operand. 8864 Value *In0 = State.get(getIncomingValue(In), Part); 8865 if (In == 0) 8866 Entry[Part] = In0; // Initialize with the first incoming value. 8867 else { 8868 // Select between the current value and the previous incoming edge 8869 // based on the incoming mask. 8870 Value *Cond = State.get(getMask(In), Part); 8871 Entry[Part] = 8872 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 8873 } 8874 } 8875 } 8876 for (unsigned Part = 0; Part < State.UF; ++Part) 8877 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 8878 } 8879 8880 void VPInterleaveRecipe::execute(VPTransformState &State) { 8881 assert(!State.Instance && "Interleave group being replicated."); 8882 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 8883 getStoredValues(), getMask()); 8884 } 8885 8886 void VPReductionRecipe::execute(VPTransformState &State) { 8887 assert(!State.Instance && "Reduction being replicated."); 8888 for (unsigned Part = 0; Part < State.UF; ++Part) { 8889 RecurKind Kind = RdxDesc->getRecurrenceKind(); 8890 Value *NewVecOp = State.get(getVecOp(), Part); 8891 if (VPValue *Cond = getCondOp()) { 8892 Value *NewCond = State.get(Cond, Part); 8893 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 8894 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 8895 Kind, VecTy->getElementType()); 8896 Constant *IdenVec = 8897 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 8898 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 8899 NewVecOp = Select; 8900 } 8901 Value *NewRed = 8902 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 8903 Value *PrevInChain = State.get(getChainOp(), Part); 8904 Value *NextInChain; 8905 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8906 NextInChain = 8907 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 8908 NewRed, PrevInChain); 8909 } else { 8910 NextInChain = State.Builder.CreateBinOp( 8911 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 8912 PrevInChain); 8913 } 8914 State.set(this, getUnderlyingInstr(), NextInChain, Part); 8915 } 8916 } 8917 8918 void VPReplicateRecipe::execute(VPTransformState &State) { 8919 if (State.Instance) { // Generate a single instance. 8920 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 8921 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, 8922 *State.Instance, IsPredicated, State); 8923 // Insert scalar instance packing it into a vector. 8924 if (AlsoPack && State.VF.isVector()) { 8925 // If we're constructing lane 0, initialize to start from poison. 
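      // Starting from poison leaves the lanes that have not been inserted yet
      // unconstrained; each predicated lane is inserted into this vector as it
      // is generated.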
      if (State.Instance->Lane == 0) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.ValueMap.setVectorValue(getUnderlyingInstr(),
                                      State.Instance->Part, Poison);
      }
      State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
                                           *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform in which case generate only the first lane for each
  // of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
                                      IsPredicated, State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
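  // In either case the phi has exactly two incoming values: one from the
  // predicating block (the unmodified vector, or poison for the scalar case)
  // and one from the predicated block itself.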
8991 unsigned Part = State.Instance->Part; 8992 Instruction *PredInst = 8993 cast<Instruction>(getOperand(0)->getUnderlyingValue()); 8994 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 8995 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 8996 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 8997 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 8998 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 8999 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9000 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 9001 } else { 9002 Type *PredInstType = PredInst->getType(); 9003 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9004 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB); 9005 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9006 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 9007 } 9008 } 9009 9010 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9011 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9012 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 9013 StoredValue ? nullptr : getVPValue(), 9014 getAddr(), StoredValue, getMask()); 9015 } 9016 9017 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9018 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9019 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9020 // for predication. 9021 static ScalarEpilogueLowering getScalarEpilogueLowering( 9022 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9023 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9024 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9025 LoopVectorizationLegality &LVL) { 9026 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9027 // don't look at hints or options, and don't request a scalar epilogue. 9028 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9029 // LoopAccessInfo (due to code dependency and not being able to reliably get 9030 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9031 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9032 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9033 // back to the old way and vectorize with versioning when forced. See D81345.) 
  if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                      PGSOQueryType::IRPass) &&
                          Hints.getForce() != LoopVectorizeHints::FK_Enabled))
    return CM_ScalarEpilogueNotAllowedOptSize;

  // 2) If set, obey the directives
  if (PreferPredicateOverEpilogue.getNumOccurrences()) {
    switch (PreferPredicateOverEpilogue) {
    case PreferPredicateTy::ScalarEpilogue:
      return CM_ScalarEpilogueAllowed;
    case PreferPredicateTy::PredicateElseScalarEpilogue:
      return CM_ScalarEpilogueNotNeededUsePredicate;
    case PreferPredicateTy::PredicateOrDontVectorize:
      return CM_ScalarEpilogueNotAllowedUsePredicate;
    };
  }

  // 3) If set, obey the hints
  switch (Hints.getPredicate()) {
  case LoopVectorizeHints::FK_Enabled:
    return CM_ScalarEpilogueNotNeededUsePredicate;
  case LoopVectorizeHints::FK_Disabled:
    return CM_ScalarEpilogueAllowed;
  };

  // 4) if the TTI hook indicates this is profitable, request predication.
  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                       LVL.getLAI()))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
                           unsigned Part) {
  set(Def, V, Part);
  ILV->setVectorValue(IRDef, Part, V);
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
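  // (EnableVPlanPredication also bails out here for now, since masked code
  // generation for the VPlan-native path is not implemented yet; see
  // LoopVectorizationPlanner::buildVPlan.)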
9111 if (VPlanBuildStressTest || EnableVPlanPredication || 9112 VectorizationFactor::Disabled() == VF) 9113 return false; 9114 9115 LVP.setBestPlan(VF.Width, 1); 9116 9117 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9118 &CM, BFI, PSI); 9119 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9120 << L->getHeader()->getParent()->getName() << "\"\n"); 9121 LVP.executePlan(LB, DT); 9122 9123 // Mark the loop as already vectorized to avoid vectorizing again. 9124 Hints.setAlreadyVectorized(); 9125 9126 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9127 return true; 9128 } 9129 9130 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9131 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9132 !EnableLoopInterleaving), 9133 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9134 !EnableLoopVectorization) {} 9135 9136 bool LoopVectorizePass::processLoop(Loop *L) { 9137 assert((EnableVPlanNativePath || L->isInnermost()) && 9138 "VPlan-native path is not enabled. Only process inner loops."); 9139 9140 #ifndef NDEBUG 9141 const std::string DebugLocStr = getDebugLocString(L); 9142 #endif /* NDEBUG */ 9143 9144 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9145 << L->getHeader()->getParent()->getName() << "\" from " 9146 << DebugLocStr << "\n"); 9147 9148 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9149 9150 LLVM_DEBUG( 9151 dbgs() << "LV: Loop hints:" 9152 << " force=" 9153 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9154 ? "disabled" 9155 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9156 ? "enabled" 9157 : "?")) 9158 << " width=" << Hints.getWidth() 9159 << " unroll=" << Hints.getInterleave() << "\n"); 9160 9161 // Function containing loop 9162 Function *F = L->getHeader()->getParent(); 9163 9164 // Looking at the diagnostic output is the only way to determine if a loop 9165 // was vectorized (other than looking at the IR or machine code), so it 9166 // is important to generate an optimization remark for each loop. Most of 9167 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9168 // generated as OptimizationRemark and OptimizationRemarkMissed are 9169 // less verbose reporting vectorized loops and unvectorized loops that may 9170 // benefit from vectorization, respectively. 9171 9172 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9173 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9174 return false; 9175 } 9176 9177 PredicatedScalarEvolution PSE(*SE, *L); 9178 9179 // Check if it is legal to vectorize the loop. 9180 LoopVectorizationRequirements Requirements(*ORE); 9181 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9182 &Requirements, &Hints, DB, AC, BFI, PSI); 9183 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9184 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9185 Hints.emitRemarkWithHints(); 9186 return false; 9187 } 9188 9189 // Check the function attributes and profiles to find out if this function 9190 // should be optimized for size. 9191 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9192 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9193 9194 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9195 // here. They may require CFG and instruction level transformations before 9196 // even evaluating whether vectorization is profitable. 
Since we cannot modify 9197 // the incoming IR, we need to build VPlan upfront in the vectorization 9198 // pipeline. 9199 if (!L->isInnermost()) 9200 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9201 ORE, BFI, PSI, Hints); 9202 9203 assert(L->isInnermost() && "Inner loop expected."); 9204 9205 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9206 // count by optimizing for size, to minimize overheads. 9207 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9208 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9209 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9210 << "This loop is worth vectorizing only if no scalar " 9211 << "iteration overheads are incurred."); 9212 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9213 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9214 else { 9215 LLVM_DEBUG(dbgs() << "\n"); 9216 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9217 } 9218 } 9219 9220 // Check the function attributes to see if implicit floats are allowed. 9221 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9222 // an integer loop and the vector instructions selected are purely integer 9223 // vector instructions? 9224 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9225 reportVectorizationFailure( 9226 "Can't vectorize when the NoImplicitFloat attribute is used", 9227 "loop not vectorized due to NoImplicitFloat attribute", 9228 "NoImplicitFloat", ORE, L); 9229 Hints.emitRemarkWithHints(); 9230 return false; 9231 } 9232 9233 // Check if the target supports potentially unsafe FP vectorization. 9234 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9235 // for the target we're vectorizing for, to make sure none of the 9236 // additional fp-math flags can help. 9237 if (Hints.isPotentiallyUnsafe() && 9238 TTI->isFPVectorizationPotentiallyUnsafe()) { 9239 reportVectorizationFailure( 9240 "Potentially unsafe FP op prevents vectorization", 9241 "loop not vectorized due to unsafe FP support.", 9242 "UnsafeFP", ORE, L); 9243 Hints.emitRemarkWithHints(); 9244 return false; 9245 } 9246 9247 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9248 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9249 9250 // If an override option has been passed in for interleaved accesses, use it. 9251 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9252 UseInterleaved = EnableInterleavedMemAccesses; 9253 9254 // Analyze interleaved memory accesses. 9255 if (UseInterleaved) { 9256 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 9257 } 9258 9259 // Use the cost model. 9260 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 9261 F, &Hints, IAI); 9262 CM.collectValuesToIgnore(); 9263 9264 // Use the planner for vectorization. 9265 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 9266 9267 // Get user vectorization factor and interleave count. 9268 ElementCount UserVF = Hints.getWidth(); 9269 unsigned UserIC = Hints.getInterleave(); 9270 9271 // Plan how to best vectorize, return the best VF and its cost. 9272 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 9273 9274 VectorizationFactor VF = VectorizationFactor::Disabled(); 9275 unsigned IC = 1; 9276 9277 if (MaybeVF) { 9278 VF = *MaybeVF; 9279 // Select the interleave count. 
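    // (A user-provided interleave count, if any, overrides this choice further
    // below.)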
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9340 ORE->emit([&]() { 9341 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 9342 L->getStartLoc(), L->getHeader()) 9343 << VecDiagMsg.second; 9344 }); 9345 ORE->emit([&]() { 9346 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 9347 L->getStartLoc(), L->getHeader()) 9348 << IntDiagMsg.second; 9349 }); 9350 return false; 9351 } else if (!VectorizeLoop && InterleaveLoop) { 9352 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9353 ORE->emit([&]() { 9354 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 9355 L->getStartLoc(), L->getHeader()) 9356 << VecDiagMsg.second; 9357 }); 9358 } else if (VectorizeLoop && !InterleaveLoop) { 9359 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9360 << ") in " << DebugLocStr << '\n'); 9361 ORE->emit([&]() { 9362 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 9363 L->getStartLoc(), L->getHeader()) 9364 << IntDiagMsg.second; 9365 }); 9366 } else if (VectorizeLoop && InterleaveLoop) { 9367 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9368 << ") in " << DebugLocStr << '\n'); 9369 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9370 } 9371 9372 LVP.setBestPlan(VF.Width, IC); 9373 9374 using namespace ore; 9375 bool DisableRuntimeUnroll = false; 9376 MDNode *OrigLoopID = L->getLoopID(); 9377 9378 if (!VectorizeLoop) { 9379 assert(IC > 1 && "interleave count should not be 1 or 0"); 9380 // If we decided that it is not legal to vectorize the loop, then 9381 // interleave it. 9382 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, 9383 BFI, PSI); 9384 LVP.executePlan(Unroller, DT); 9385 9386 ORE->emit([&]() { 9387 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 9388 L->getHeader()) 9389 << "interleaved loop (interleaved count: " 9390 << NV("InterleaveCount", IC) << ")"; 9391 }); 9392 } else { 9393 // If we decided that it is *legal* to vectorize the loop, then do it. 9394 9395 // Consider vectorizing the epilogue too if it's profitable. 9396 VectorizationFactor EpilogueVF = 9397 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 9398 if (EpilogueVF.Width.isVector()) { 9399 9400 // The first pass vectorizes the main loop and creates a scalar epilogue 9401 // to be vectorized by executing the plan (potentially with a different 9402 // factor) again shortly afterwards. 9403 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 9404 EpilogueVF.Width.getKnownMinValue(), 1); 9405 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, 9406 &LVL, &CM, BFI, PSI); 9407 9408 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 9409 LVP.executePlan(MainILV, DT); 9410 ++LoopsVectorized; 9411 9412 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9413 formLCSSARecursively(*L, *DT, LI, SE); 9414 9415 // Second pass vectorizes the epilogue and adjusts the control flow 9416 // edges from the first pass. 
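      // (EPI.MainLoopVF/MainLoopUF are re-pointed at the epilogue VF/UF below,
      // presumably so the shared setup code picks up the epilogue parameters.)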
      LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
      EPI.MainLoopVF = EPI.EpilogueVF;
      EPI.MainLoopUF = EPI.EpilogueUF;
      EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                               ORE, EPI, &LVL, &CM, BFI, PSI);
      LVP.executePlan(EpilogILV, DT);
      ++LoopsEpilogueVectorized;

      if (!MainILV.areSafetyChecksAdded())
        DisableRuntimeUnroll = true;
    } else {
      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                             &LVL, &CM, BFI, PSI);
      LVP.executePlan(LB, DT);
      ++LoopsVectorized;

      // Add metadata to disable runtime unrolling a scalar loop when there are
      // no runtime checks about strides and memory. A scalar loop that is
      // rarely used is not worth unrolling.
      if (!LB.areSafetyChecksAdded())
        DisableRuntimeUnroll = true;
    }

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
9511 SmallVector<Loop *, 8> Worklist; 9512 9513 for (Loop *L : *LI) 9514 collectSupportedLoops(*L, LI, ORE, Worklist); 9515 9516 LoopsAnalyzed += Worklist.size(); 9517 9518 // Now walk the identified inner loops. 9519 while (!Worklist.empty()) { 9520 Loop *L = Worklist.pop_back_val(); 9521 9522 // For the inner loops we actually process, form LCSSA to simplify the 9523 // transform. 9524 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 9525 9526 Changed |= CFGChanged |= processLoop(L); 9527 } 9528 9529 // Process each loop nest in the function. 9530 return LoopVectorizeResult(Changed, CFGChanged); 9531 } 9532 9533 PreservedAnalyses LoopVectorizePass::run(Function &F, 9534 FunctionAnalysisManager &AM) { 9535 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 9536 auto &LI = AM.getResult<LoopAnalysis>(F); 9537 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 9538 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 9539 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 9540 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 9541 auto &AA = AM.getResult<AAManager>(F); 9542 auto &AC = AM.getResult<AssumptionAnalysis>(F); 9543 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 9544 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 9545 MemorySSA *MSSA = EnableMSSALoopDependency 9546 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 9547 : nullptr; 9548 9549 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 9550 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 9551 [&](Loop &L) -> const LoopAccessInfo & { 9552 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 9553 TLI, TTI, nullptr, MSSA}; 9554 return LAM.getResult<LoopAccessAnalysis>(L, AR); 9555 }; 9556 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 9557 ProfileSummaryInfo *PSI = 9558 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 9559 LoopVectorizeResult Result = 9560 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 9561 if (!Result.MadeAnyChange) 9562 return PreservedAnalyses::all(); 9563 PreservedAnalyses PA; 9564 9565 // We currently do not preserve loopinfo/dominator analyses with outer loop 9566 // vectorization. Until this is addressed, mark these analyses as preserved 9567 // only for non-VPlan-native path. 9568 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 9569 if (!EnableVPlanNativePath) { 9570 PA.preserve<LoopAnalysis>(); 9571 PA.preserve<DominatorTreeAnalysis>(); 9572 } 9573 PA.preserve<BasicAA>(); 9574 PA.preserve<GlobalsAA>(); 9575 if (!Result.MadeCFGChange) 9576 PA.preserveSet<CFGAnalyses>(); 9577 return PA; 9578 } 9579