//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and the values below list all
// options. I.e., the vectorizer will try to fold the tail loop (epilogue) into
// the vector body and predicate the instructions accordingly. If tail-folding
// fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
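// An illustrative (unverified) invocation combining these flags might be:
//   opt -loop-vectorize -enable-vplan-native-path -vplan-build-stress-test \
//       -vplan-verify-hcfg input.ll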
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return TypeSize::get(VF.getKnownMinValue() *
                             DL.getTypeAllocSize(Ty).getFixedValue(),
                         VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
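/// For example, if the exact count is unknown but profile data estimates
/// roughly 100 iterations while SCEV only knows an upper bound of 1024,
/// step 2 returns 100 before the upper bound in step 3 is consulted.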
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
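  /// (In general the call is widened either to a vector intrinsic or to a
  /// target-provided vector library function; see the implementation for the
  /// exact selection logic.)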
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead
  /// of \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate
  /// a vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
    VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
  }

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
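  /// (Roughly: the first phase widened the phi itself; this phase wires up
  /// its incoming values and emits the final reduction of the vector lanes
  /// after the loop.)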
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
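  /// (This is the canonical induction of the vector loop; it is expected to
  /// advance by VF * UF per vector iteration.)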
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
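/// For example, a loop might first be vectorized with MainLoopVF=16 and
/// MainLoopUF=2, and its remainder then vectorized with EpilogueVF=8 and
/// EpilogueUF=1 (illustrative values only).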
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
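/// The result looks like e.g. "foo.c:17:3", or falls back to the module
/// identifier when the loop has no start location.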
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
1302 if (EnableVPlanNativePath)
1303 return false;
1304
1305 auto Scalars = InstsToScalarize.find(VF);
1306 assert(Scalars != InstsToScalarize.end() &&
1307 "VF not yet analyzed for scalarization profitability");
1308 return Scalars->second.find(I) != Scalars->second.end();
1309 }
1310
1311 /// Returns true if \p I is known to be uniform after vectorization.
1312 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1313 if (VF.isScalar())
1314 return true;
1315
1316 // Cost model is not run in the VPlan-native path - return conservative
1317 // result until this changes.
1318 if (EnableVPlanNativePath)
1319 return false;
1320
1321 auto UniformsPerVF = Uniforms.find(VF);
1322 assert(UniformsPerVF != Uniforms.end() &&
1323 "VF not yet analyzed for uniformity");
1324 return UniformsPerVF->second.count(I);
1325 }
1326
1327 /// Returns true if \p I is known to be scalar after vectorization.
1328 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1329 if (VF.isScalar())
1330 return true;
1331
1332 // Cost model is not run in the VPlan-native path - return conservative
1333 // result until this changes.
1334 if (EnableVPlanNativePath)
1335 return false;
1336
1337 auto ScalarsPerVF = Scalars.find(VF);
1338 assert(ScalarsPerVF != Scalars.end() &&
1339 "Scalar values are not calculated for VF");
1340 return ScalarsPerVF->second.count(I);
1341 }
1342
1343 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1344 /// for vectorization factor \p VF.
1345 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1346 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1347 !isProfitableToScalarize(I, VF) &&
1348 !isScalarAfterVectorization(I, VF);
1349 }
1350
1351 /// The decision that was taken during cost calculation for a memory instruction.
1352 enum InstWidening {
1353 CM_Unknown,
1354 CM_Widen, // For consecutive accesses with stride +1.
1355 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1356 CM_Interleave,
1357 CM_GatherScatter,
1358 CM_Scalarize
1359 };
1360
1361 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1362 /// instruction \p I and vector width \p VF.
1363 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1364 unsigned Cost) {
1365 assert(VF.isVector() && "Expected VF >=2");
1366 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1367 }
1368
1369 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1370 /// interleaving group \p Grp and vector width \p VF.
1371 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1372 ElementCount VF, InstWidening W, unsigned Cost) {
1373 assert(VF.isVector() && "Expected VF >=2");
1374 // Broadcast this decision to all instructions inside the group.
1375 // But the cost will be assigned to one instruction only.
1376 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1377 if (auto *I = Grp->getMember(i)) {
1378 if (Grp->getInsertPos() == I)
1379 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1380 else
1381 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1382 }
1383 }
1384 }
1385
1386 /// Return the cost model decision for the given instruction \p I and vector
1387 /// width \p VF. Return CM_Unknown if this instruction did not pass
1388 /// through the cost modeling.
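/// For illustration only (not an exhaustive mapping): a consecutive access
/// with stride +1 would typically have been recorded as CM_Widen, a stride
/// of -1 as CM_Widen_Reverse, a member of an interleave group as
/// CM_Interleave, and an irregular access as CM_GatherScatter or
/// CM_Scalarize, depending on target support and cost.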
1389 InstWidening getWideningDecision(Instruction *I, ElementCount VF) { 1390 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1391 assert(VF.isVector() && "Expected VF >=2"); 1392 1393 // Cost model is not run in the VPlan-native path - return conservative 1394 // result until this changes. 1395 if (EnableVPlanNativePath) 1396 return CM_GatherScatter; 1397 1398 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1399 auto Itr = WideningDecisions.find(InstOnVF); 1400 if (Itr == WideningDecisions.end()) 1401 return CM_Unknown; 1402 return Itr->second.first; 1403 } 1404 1405 /// Return the vectorization cost for the given instruction \p I and vector 1406 /// width \p VF. 1407 unsigned getWideningCost(Instruction *I, ElementCount VF) { 1408 assert(VF.isVector() && "Expected VF >=2"); 1409 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1410 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1411 "The cost is not calculated"); 1412 return WideningDecisions[InstOnVF].second; 1413 } 1414 1415 /// Return True if instruction \p I is an optimizable truncate whose operand 1416 /// is an induction variable. Such a truncate will be removed by adding a new 1417 /// induction variable with the destination type. 1418 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1419 // If the instruction is not a truncate, return false. 1420 auto *Trunc = dyn_cast<TruncInst>(I); 1421 if (!Trunc) 1422 return false; 1423 1424 // Get the source and destination types of the truncate. 1425 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1426 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1427 1428 // If the truncate is free for the given types, return false. Replacing a 1429 // free truncate with an induction variable would add an induction variable 1430 // update instruction to each iteration of the loop. We exclude from this 1431 // check the primary induction variable since it will need an update 1432 // instruction regardless. 1433 Value *Op = Trunc->getOperand(0); 1434 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1435 return false; 1436 1437 // If the truncated value is not an induction variable, return false. 1438 return Legal->isInductionPhi(Op); 1439 } 1440 1441 /// Collects the instructions to scalarize for each predicated instruction in 1442 /// the loop. 1443 void collectInstsToScalarize(ElementCount VF); 1444 1445 /// Collect Uniform and Scalar values for the given \p VF. 1446 /// The sets depend on CM decision for Load/Store instructions 1447 /// that may be vectorized as interleave, gather-scatter or scalarized. 1448 void collectUniformsAndScalars(ElementCount VF) { 1449 // Do the analysis once. 1450 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1451 return; 1452 setCostBasedWideningDecision(VF); 1453 collectLoopUniforms(VF); 1454 collectLoopScalars(VF); 1455 } 1456 1457 /// Returns true if the target machine supports masked store operation 1458 /// for the given \p DataType and kind of access to \p Ptr. 1459 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) { 1460 return Legal->isConsecutivePtr(Ptr) && 1461 TTI.isLegalMaskedStore(DataType, Alignment); 1462 } 1463 1464 /// Returns true if the target machine supports masked load operation 1465 /// for the given \p DataType and kind of access to \p Ptr. 
1466 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1467 return Legal->isConsecutivePtr(Ptr) &&
1468 TTI.isLegalMaskedLoad(DataType, Alignment);
1469 }
1470
1471 /// Returns true if the target machine supports masked scatter operation
1472 /// for the given \p DataType.
1473 bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1474 return TTI.isLegalMaskedScatter(DataType, Alignment);
1475 }
1476
1477 /// Returns true if the target machine supports masked gather operation
1478 /// for the given \p DataType.
1479 bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1480 return TTI.isLegalMaskedGather(DataType, Alignment);
1481 }
1482
1483 /// Returns true if the target machine can represent \p V as a masked gather
1484 /// or scatter operation.
1485 bool isLegalGatherOrScatter(Value *V) {
1486 bool LI = isa<LoadInst>(V);
1487 bool SI = isa<StoreInst>(V);
1488 if (!LI && !SI)
1489 return false;
1490 auto *Ty = getMemInstValueType(V);
1491 Align Align = getLoadStoreAlignment(V);
1492 return (LI && isLegalMaskedGather(Ty, Align)) ||
1493 (SI && isLegalMaskedScatter(Ty, Align));
1494 }
1495
1496 /// Returns true if \p I is an instruction that will be scalarized with
1497 /// predication. Such instructions include conditional stores and
1498 /// instructions that may divide by zero.
1499 /// If a non-zero VF has been calculated, we check if I will be scalarized
1500 /// with predication for that VF.
1501 bool isScalarWithPredication(Instruction *I,
1502 ElementCount VF = ElementCount::getFixed(1));
1503
1504 /// Returns true if \p I is an instruction that will be predicated either
1505 /// through scalar predication or masked load/store or masked gather/scatter.
1506 /// This is a superset of the instructions that return true for
1507 /// isScalarWithPredication.
bool isPredicatedInst(Instruction *I) {
1508 if (!blockNeedsPredication(I->getParent()))
1509 return false;
1510 // Loads and stores that need some form of masked operation are predicated
1511 // instructions.
1512 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1513 return Legal->isMaskRequired(I);
1514 return isScalarWithPredication(I);
1515 }
1516
1517 /// Returns true if \p I is a memory instruction with consecutive memory
1518 /// access that can be widened.
1519 bool
1520 memoryInstructionCanBeWidened(Instruction *I,
1521 ElementCount VF = ElementCount::getFixed(1));
1522
1523 /// Returns true if \p I is a memory instruction in an interleaved-group
1524 /// of memory accesses that can be vectorized with wide vector loads/stores
1525 /// and shuffles.
1526 bool
1527 interleavedAccessCanBeWidened(Instruction *I,
1528 ElementCount VF = ElementCount::getFixed(1));
1529
1530 /// Check if \p Instr belongs to any interleaved access group.
1531 bool isAccessInterleaved(Instruction *Instr) {
1532 return InterleaveInfo.isInterleaved(Instr);
1533 }
1534
1535 /// Get the interleaved access group that \p Instr belongs to.
1536 const InterleaveGroup<Instruction> *
1537 getInterleavedAccessGroup(Instruction *Instr) {
1538 return InterleaveInfo.getInterleaveGroup(Instr);
1539 }
1540
1541 /// Returns true if an interleaved group requires a scalar iteration
1542 /// to handle accesses with gaps, and there is nothing preventing us from
1543 /// creating a scalar epilogue.
1544 bool requiresScalarEpilogue() const {
1545 return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1546 }
1547
1548 /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1549 /// disabled due to optsize or a loop hint annotation.
1550 bool isScalarEpilogueAllowed() const { 1551 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1552 } 1553 1554 /// Returns true if all loop blocks should be masked to fold tail loop. 1555 bool foldTailByMasking() const { return FoldTailByMasking; } 1556 1557 bool blockNeedsPredication(BasicBlock *BB) { 1558 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1559 } 1560 1561 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1562 /// nodes to the chain of instructions representing the reductions. Uses a 1563 /// MapVector to ensure deterministic iteration order. 1564 using ReductionChainMap = 1565 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1566 1567 /// Return the chain of instructions representing an inloop reduction. 1568 const ReductionChainMap &getInLoopReductionChains() const { 1569 return InLoopReductionChains; 1570 } 1571 1572 /// Returns true if the Phi is part of an inloop reduction. 1573 bool isInLoopReduction(PHINode *Phi) const { 1574 return InLoopReductionChains.count(Phi); 1575 } 1576 1577 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1578 /// with factor VF. Return the cost of the instruction, including 1579 /// scalarization overhead if it's needed. 1580 unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1581 1582 /// Estimate cost of a call instruction CI if it were vectorized with factor 1583 /// VF. Return the cost of the instruction, including scalarization overhead 1584 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1585 /// scalarized - 1586 /// i.e. either vector version isn't available, or is too expensive. 1587 unsigned getVectorCallCost(CallInst *CI, ElementCount VF, 1588 bool &NeedToScalarize); 1589 1590 /// Invalidates decisions already taken by the cost model. 1591 void invalidateCostModelingDecisions() { 1592 WideningDecisions.clear(); 1593 Uniforms.clear(); 1594 Scalars.clear(); 1595 } 1596 1597 private: 1598 unsigned NumPredStores = 0; 1599 1600 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1601 /// than zero. One is returned if vectorization should best be avoided due 1602 /// to cost. 1603 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, 1604 ElementCount UserVF); 1605 1606 /// The vectorization cost is a combination of the cost itself and a boolean 1607 /// indicating whether any of the contributing operations will actually 1608 /// operate on 1609 /// vector values after type legalization in the backend. If this latter value 1610 /// is 1611 /// false, then all operations will be scalarized (i.e. no vectorization has 1612 /// actually taken place). 1613 using VectorizationCostTy = std::pair<unsigned, bool>; 1614 1615 /// Returns the expected execution cost. The unit of the cost does 1616 /// not matter because we use the 'cost' units to compare different 1617 /// vector widths. The cost that is returned is *not* normalized by 1618 /// the factor width. 1619 VectorizationCostTy expectedCost(ElementCount VF); 1620 1621 /// Returns the execution time cost of an instruction for a given vector 1622 /// width. Vector width of one means scalar. 1623 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1624 1625 /// The cost-computation logic from getInstructionCost which provides 1626 /// the vector type as an output parameter. 
1627 unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1628
1629 /// Calculate the vectorization cost of memory instruction \p I.
1630 unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1631
1632 /// The cost computation for a scalarized memory instruction.
1633 unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1634
1635 /// The cost computation for an interleaving group of memory instructions.
1636 unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1637
1638 /// The cost computation for a Gather/Scatter instruction.
1639 unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1640
1641 /// The cost computation for widening instruction \p I with consecutive
1642 /// memory access.
1643 unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1644
1645 /// The cost calculation for Load/Store instruction \p I with a uniform pointer:
1646 /// Load: scalar load + broadcast.
1647 /// Store: scalar store + (loop invariant value stored ? 0 : extract of last
1648 /// element).
1649 unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1650
1651 /// Estimate the overhead of scalarizing an instruction. This is a
1652 /// convenience wrapper for the type-based getScalarizationOverhead API.
1653 unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1654
1655 /// Returns whether the instruction is a load or store and will be emitted
1656 /// as a vector operation.
1657 bool isConsecutiveLoadOrStore(Instruction *I);
1658
1659 /// Returns true if an artificially high cost for emulated masked memrefs
1660 /// should be used.
1661 bool useEmulatedMaskMemRefHack(Instruction *I);
1662
1663 /// Map of scalar integer values to the smallest bitwidth they can be legally
1664 /// represented as. The vector equivalents of these values should be truncated
1665 /// to this type.
1666 MapVector<Instruction *, uint64_t> MinBWs;
1667
1668 /// A type representing the costs for instructions if they were to be
1669 /// scalarized rather than vectorized. The entries are Instruction-Cost
1670 /// pairs.
1671 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1672
1673 /// A set containing all BasicBlocks that are known to be present after
1674 /// vectorization as predicated blocks.
1675 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1676
1677 /// Records whether it is allowed to have the original scalar loop execute at
1678 /// least once. This may be needed as a fallback loop in case runtime
1679 /// aliasing/dependence checks fail, or to handle the tail/remainder
1680 /// iterations when the trip count is unknown or is not divisible by the VF,
1681 /// or as a peel-loop to handle gaps in interleave-groups.
1682 /// Under optsize and when the trip count is very small we don't allow any
1683 /// iterations to execute in the scalar loop.
1684 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1685
1686 /// All blocks of the loop are to be masked to fold the tail of scalar iterations.
1687 bool FoldTailByMasking = false;
1688
1689 /// A map holding scalar costs for different vectorization factors. The
1690 /// presence of a cost for an instruction in the mapping indicates that the
1691 /// instruction will be scalarized when vectorizing with the associated
1692 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1693 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1694
1695 /// Holds the instructions known to be uniform after vectorization.
1696 /// The data is collected per VF. 1697 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1698 1699 /// Holds the instructions known to be scalar after vectorization. 1700 /// The data is collected per VF. 1701 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1702 1703 /// Holds the instructions (address computations) that are forced to be 1704 /// scalarized. 1705 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1706 1707 /// PHINodes of the reductions that should be expanded in-loop along with 1708 /// their associated chains of reduction operations, in program order from top 1709 /// (PHI) to bottom 1710 ReductionChainMap InLoopReductionChains; 1711 1712 /// Returns the expected difference in cost from scalarizing the expression 1713 /// feeding a predicated instruction \p PredInst. The instructions to 1714 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1715 /// non-negative return value implies the expression will be scalarized. 1716 /// Currently, only single-use chains are considered for scalarization. 1717 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1718 ElementCount VF); 1719 1720 /// Collect the instructions that are uniform after vectorization. An 1721 /// instruction is uniform if we represent it with a single scalar value in 1722 /// the vectorized loop corresponding to each vector iteration. Examples of 1723 /// uniform instructions include pointer operands of consecutive or 1724 /// interleaved memory accesses. Note that although uniformity implies an 1725 /// instruction will be scalar, the reverse is not true. In general, a 1726 /// scalarized instruction will be represented by VF scalar values in the 1727 /// vectorized loop, each corresponding to an iteration of the original 1728 /// scalar loop. 1729 void collectLoopUniforms(ElementCount VF); 1730 1731 /// Collect the instructions that are scalar after vectorization. An 1732 /// instruction is scalar if it is known to be uniform or will be scalarized 1733 /// during vectorization. Non-uniform scalarized instructions will be 1734 /// represented by VF values in the vectorized loop, each corresponding to an 1735 /// iteration of the original scalar loop. 1736 void collectLoopScalars(ElementCount VF); 1737 1738 /// Keeps cost model vectorization decision and cost for instructions. 1739 /// Right now it is used for memory instructions only. 1740 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1741 std::pair<InstWidening, unsigned>>; 1742 1743 DecisionList WideningDecisions; 1744 1745 /// Returns true if \p V is expected to be vectorized and it needs to be 1746 /// extracted. 1747 bool needsExtract(Value *V, ElementCount VF) const { 1748 Instruction *I = dyn_cast<Instruction>(V); 1749 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1750 TheLoop->isLoopInvariant(I)) 1751 return false; 1752 1753 // Assume we can vectorize V (and hence we need extraction) if the 1754 // scalars are not computed yet. This can happen, because it is called 1755 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1756 // the scalars are collected. That should be a safe assumption in most 1757 // cases, because we check if the operands have vectorizable types 1758 // beforehand in LoopVectorizationLegality. 1759 return Scalars.find(VF) == Scalars.end() || 1760 !isScalarAfterVectorization(I, VF); 1761 }; 1762 1763 /// Returns a range containing only operands needing to be extracted. 
1764 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1765 ElementCount VF) { 1766 return SmallVector<Value *, 4>(make_filter_range( 1767 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1768 } 1769 1770 /// Determines if we have the infrastructure to vectorize loop \p L and its 1771 /// epilogue, assuming the main loop is vectorized by \p VF. 1772 bool isCandidateForEpilogueVectorization(const Loop &L, 1773 const ElementCount VF) const; 1774 1775 /// Returns true if epilogue vectorization is considered profitable, and 1776 /// false otherwise. 1777 /// \p VF is the vectorization factor chosen for the original loop. 1778 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1779 1780 public: 1781 /// The loop that we evaluate. 1782 Loop *TheLoop; 1783 1784 /// Predicated scalar evolution analysis. 1785 PredicatedScalarEvolution &PSE; 1786 1787 /// Loop Info analysis. 1788 LoopInfo *LI; 1789 1790 /// Vectorization legality. 1791 LoopVectorizationLegality *Legal; 1792 1793 /// Vector target information. 1794 const TargetTransformInfo &TTI; 1795 1796 /// Target Library Info. 1797 const TargetLibraryInfo *TLI; 1798 1799 /// Demanded bits analysis. 1800 DemandedBits *DB; 1801 1802 /// Assumption cache. 1803 AssumptionCache *AC; 1804 1805 /// Interface to emit optimization remarks. 1806 OptimizationRemarkEmitter *ORE; 1807 1808 const Function *TheFunction; 1809 1810 /// Loop Vectorize Hint. 1811 const LoopVectorizeHints *Hints; 1812 1813 /// The interleave access information contains groups of interleaved accesses 1814 /// with the same stride and close to each other. 1815 InterleavedAccessInfo &InterleaveInfo; 1816 1817 /// Values to ignore in the cost model. 1818 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1819 1820 /// Values to ignore in the cost model when VF > 1. 1821 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1822 1823 /// Profitable vector factors. 1824 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1825 }; 1826 1827 } // end namespace llvm 1828 1829 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1830 // vectorization. The loop needs to be annotated with #pragma omp simd 1831 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1832 // vector length information is not provided, vectorization is not considered 1833 // explicit. Interleave hints are not allowed either. These limitations will be 1834 // relaxed in the future. 1835 // Please, note that we are currently forced to abuse the pragma 'clang 1836 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1837 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1838 // provides *explicit vectorization hints* (LV can bypass legal checks and 1839 // assume that vectorization is legal). However, both hints are implemented 1840 // using the same metadata (llvm.loop.vectorize, processed by 1841 // LoopVectorizeHints). This will be fixed in the future when the native IR 1842 // representation for pragma 'omp simd' is introduced. 1843 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1844 OptimizationRemarkEmitter *ORE) { 1845 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 1846 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1847 1848 // Only outer loops with an explicit vectorization hint are supported. 1849 // Unannotated outer loops are ignored. 
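// For example (illustrative values): an outer loop annotated with
// '#pragma omp simd simdlen(4)' or
// '#pragma clang loop vectorize(enable) vectorize_width(4)' carries the
// force and width hints inspected below, whereas an unannotated outer loop
// has no force hint (FK_Undefined) and is rejected by the first check.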
1850 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1851 return false; 1852 1853 Function *Fn = OuterLp->getHeader()->getParent(); 1854 if (!Hints.allowVectorization(Fn, OuterLp, 1855 true /*VectorizeOnlyWhenForced*/)) { 1856 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1857 return false; 1858 } 1859 1860 if (Hints.getInterleave() > 1) { 1861 // TODO: Interleave support is future work. 1862 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1863 "outer loops.\n"); 1864 Hints.emitRemarkWithHints(); 1865 return false; 1866 } 1867 1868 return true; 1869 } 1870 1871 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1872 OptimizationRemarkEmitter *ORE, 1873 SmallVectorImpl<Loop *> &V) { 1874 // Collect inner loops and outer loops without irreducible control flow. For 1875 // now, only collect outer loops that have explicit vectorization hints. If we 1876 // are stress testing the VPlan H-CFG construction, we collect the outermost 1877 // loop of every loop nest. 1878 if (L.isInnermost() || VPlanBuildStressTest || 1879 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1880 LoopBlocksRPO RPOT(&L); 1881 RPOT.perform(LI); 1882 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1883 V.push_back(&L); 1884 // TODO: Collect inner loops inside marked outer loops in case 1885 // vectorization fails for the outer loop. Do not invoke 1886 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1887 // already known to be reducible. We can use an inherited attribute for 1888 // that. 1889 return; 1890 } 1891 } 1892 for (Loop *InnerL : L) 1893 collectSupportedLoops(*InnerL, LI, ORE, V); 1894 } 1895 1896 namespace { 1897 1898 /// The LoopVectorize Pass. 1899 struct LoopVectorize : public FunctionPass { 1900 /// Pass identification, replacement for typeid 1901 static char ID; 1902 1903 LoopVectorizePass Impl; 1904 1905 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1906 bool VectorizeOnlyWhenForced = false) 1907 : FunctionPass(ID), 1908 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1909 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1910 } 1911 1912 bool runOnFunction(Function &F) override { 1913 if (skipFunction(F)) 1914 return false; 1915 1916 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1917 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1918 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1919 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1920 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1921 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1922 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 1923 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1924 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1925 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1926 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1927 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1928 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1929 1930 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1931 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1932 1933 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1934 GetLAA, *ORE, PSI).MadeAnyChange; 1935 } 1936 1937 void getAnalysisUsage(AnalysisUsage &AU) const override { 1938 AU.addRequired<AssumptionCacheTracker>(); 1939 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1940 AU.addRequired<DominatorTreeWrapperPass>(); 1941 AU.addRequired<LoopInfoWrapperPass>(); 1942 AU.addRequired<ScalarEvolutionWrapperPass>(); 1943 AU.addRequired<TargetTransformInfoWrapperPass>(); 1944 AU.addRequired<AAResultsWrapperPass>(); 1945 AU.addRequired<LoopAccessLegacyAnalysis>(); 1946 AU.addRequired<DemandedBitsWrapperPass>(); 1947 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1948 AU.addRequired<InjectTLIMappingsLegacy>(); 1949 1950 // We currently do not preserve loopinfo/dominator analyses with outer loop 1951 // vectorization. Until this is addressed, mark these analyses as preserved 1952 // only for non-VPlan-native path. 1953 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1954 if (!EnableVPlanNativePath) { 1955 AU.addPreserved<LoopInfoWrapperPass>(); 1956 AU.addPreserved<DominatorTreeWrapperPass>(); 1957 } 1958 1959 AU.addPreserved<BasicAAWrapperPass>(); 1960 AU.addPreserved<GlobalsAAWrapperPass>(); 1961 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1962 } 1963 }; 1964 1965 } // end anonymous namespace 1966 1967 //===----------------------------------------------------------------------===// 1968 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1969 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1970 //===----------------------------------------------------------------------===// 1971 1972 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1973 // We need to place the broadcast of invariant variables outside the loop, 1974 // but only if it's proven safe to do so. Else, broadcast will be inside 1975 // vector loop body. 1976 Instruction *Instr = dyn_cast<Instruction>(V); 1977 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1978 (!Instr || 1979 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1980 // Place the code for broadcasting invariant variables in the new preheader. 1981 IRBuilder<>::InsertPointGuard Guard(Builder); 1982 if (SafeToHoist) 1983 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1984 1985 // Broadcast the scalar into all locations in the vector. 
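// Illustrative sketch (assuming a fixed VF of 4 and an i32 scalar %x): the
// splat below expands to roughly
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer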
1986 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1987 1988 return Shuf; 1989 } 1990 1991 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1992 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1993 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1994 "Expected either an induction phi-node or a truncate of it!"); 1995 Value *Start = II.getStartValue(); 1996 1997 // Construct the initial value of the vector IV in the vector loop preheader 1998 auto CurrIP = Builder.saveIP(); 1999 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2000 if (isa<TruncInst>(EntryVal)) { 2001 assert(Start->getType()->isIntegerTy() && 2002 "Truncation requires an integer type"); 2003 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2004 Step = Builder.CreateTrunc(Step, TruncType); 2005 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2006 } 2007 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2008 Value *SteppedStart = 2009 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2010 2011 // We create vector phi nodes for both integer and floating-point induction 2012 // variables. Here, we determine the kind of arithmetic we will perform. 2013 Instruction::BinaryOps AddOp; 2014 Instruction::BinaryOps MulOp; 2015 if (Step->getType()->isIntegerTy()) { 2016 AddOp = Instruction::Add; 2017 MulOp = Instruction::Mul; 2018 } else { 2019 AddOp = II.getInductionOpcode(); 2020 MulOp = Instruction::FMul; 2021 } 2022 2023 // Multiply the vectorization factor by the step using integer or 2024 // floating-point arithmetic as appropriate. 2025 Value *ConstVF = 2026 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2027 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 2028 2029 // Create a vector splat to use in the induction update. 2030 // 2031 // FIXME: If the step is non-constant, we create the vector splat with 2032 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2033 // handle a constant vector splat. 2034 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2035 Value *SplatVF = isa<Constant>(Mul) 2036 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2037 : Builder.CreateVectorSplat(VF, Mul); 2038 Builder.restoreIP(CurrIP); 2039 2040 // We may need to add the step a number of times, depending on the unroll 2041 // factor. The last of those goes into the PHI. 2042 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2043 &*LoopVectorBody->getFirstInsertionPt()); 2044 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2045 Instruction *LastInduction = VecInd; 2046 for (unsigned Part = 0; Part < UF; ++Part) { 2047 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 2048 2049 if (isa<TruncInst>(EntryVal)) 2050 addMetadata(LastInduction, EntryVal); 2051 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 2052 2053 LastInduction = cast<Instruction>(addFastMathFlag( 2054 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 2055 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2056 } 2057 2058 // Move the last step to the end of the latch block. This ensures consistent 2059 // placement of all induction updates. 
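// (Illustrative, assuming UF = 2, VF = 4 and a unit step: part 0 uses the
// "vec.ind" PHI itself, part 1 uses the first "step.add" (vec.ind +
// <4, 4, 4, 4>), and the second "step.add" becomes "vec.ind.next", which is
// moved into the latch below and fed back into the PHI.)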
2060 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2061 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2062 auto *ICmp = cast<Instruction>(Br->getCondition()); 2063 LastInduction->moveBefore(ICmp); 2064 LastInduction->setName("vec.ind.next"); 2065 2066 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2067 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2068 } 2069 2070 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2071 return Cost->isScalarAfterVectorization(I, VF) || 2072 Cost->isProfitableToScalarize(I, VF); 2073 } 2074 2075 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2076 if (shouldScalarizeInstruction(IV)) 2077 return true; 2078 auto isScalarInst = [&](User *U) -> bool { 2079 auto *I = cast<Instruction>(U); 2080 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2081 }; 2082 return llvm::any_of(IV->users(), isScalarInst); 2083 } 2084 2085 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2086 const InductionDescriptor &ID, const Instruction *EntryVal, 2087 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 2088 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2089 "Expected either an induction phi-node or a truncate of it!"); 2090 2091 // This induction variable is not the phi from the original loop but the 2092 // newly-created IV based on the proof that casted Phi is equal to the 2093 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2094 // re-uses the same InductionDescriptor that original IV uses but we don't 2095 // have to do any recording in this case - that is done when original IV is 2096 // processed. 2097 if (isa<TruncInst>(EntryVal)) 2098 return; 2099 2100 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2101 if (Casts.empty()) 2102 return; 2103 // Only the first Cast instruction in the Casts vector is of interest. 2104 // The rest of the Casts (if exist) have no uses outside the 2105 // induction update chain itself. 2106 Instruction *CastInst = *Casts.begin(); 2107 if (Lane < UINT_MAX) 2108 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 2109 else 2110 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 2111 } 2112 2113 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 2114 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2115 "Primary induction variable must have an integer type"); 2116 2117 auto II = Legal->getInductionVars().find(IV); 2118 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2119 2120 auto ID = II->second; 2121 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2122 2123 // The value from the original loop to which we are mapping the new induction 2124 // variable. 2125 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2126 2127 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2128 2129 // Generate code for the induction step. 
Note that induction steps are 2130 // required to be loop-invariant 2131 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2132 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2133 "Induction step should be loop invariant"); 2134 if (PSE.getSE()->isSCEVable(IV->getType())) { 2135 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2136 return Exp.expandCodeFor(Step, Step->getType(), 2137 LoopVectorPreHeader->getTerminator()); 2138 } 2139 return cast<SCEVUnknown>(Step)->getValue(); 2140 }; 2141 2142 // The scalar value to broadcast. This is derived from the canonical 2143 // induction variable. If a truncation type is given, truncate the canonical 2144 // induction variable and step. Otherwise, derive these values from the 2145 // induction descriptor. 2146 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2147 Value *ScalarIV = Induction; 2148 if (IV != OldInduction) { 2149 ScalarIV = IV->getType()->isIntegerTy() 2150 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2151 : Builder.CreateCast(Instruction::SIToFP, Induction, 2152 IV->getType()); 2153 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2154 ScalarIV->setName("offset.idx"); 2155 } 2156 if (Trunc) { 2157 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2158 assert(Step->getType()->isIntegerTy() && 2159 "Truncation requires an integer step"); 2160 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2161 Step = Builder.CreateTrunc(Step, TruncType); 2162 } 2163 return ScalarIV; 2164 }; 2165 2166 // Create the vector values from the scalar IV, in the absence of creating a 2167 // vector IV. 2168 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2169 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2170 for (unsigned Part = 0; Part < UF; ++Part) { 2171 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2172 Value *EntryPart = 2173 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2174 ID.getInductionOpcode()); 2175 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 2176 if (Trunc) 2177 addMetadata(EntryPart, Trunc); 2178 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 2179 } 2180 }; 2181 2182 // Now do the actual transformations, and start with creating the step value. 2183 Value *Step = CreateStepValue(ID.getStep()); 2184 if (VF.isZero() || VF.isScalar()) { 2185 Value *ScalarIV = CreateScalarIV(Step); 2186 CreateSplatIV(ScalarIV, Step); 2187 return; 2188 } 2189 2190 // Determine if we want a scalar version of the induction variable. This is 2191 // true if the induction variable itself is not widened, or if it has at 2192 // least one user in the loop that is not widened. 2193 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2194 if (!NeedsScalarIV) { 2195 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2196 return; 2197 } 2198 2199 // Try to create a new independent vector induction variable. If we can't 2200 // create the phi node, we will splat the scalar induction variable in each 2201 // loop iteration. 2202 if (!shouldScalarizeInstruction(EntryVal)) { 2203 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2204 Value *ScalarIV = CreateScalarIV(Step); 2205 // Create scalar steps that can be used by instructions we will later 2206 // scalarize. Note that the addition of the scalar steps will not increase 2207 // the number of instructions in the loop in the common case prior to 2208 // InstCombine. We will be trading one vector extract for each scalar step. 
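// (Illustrative, assuming VF = 4, UF = 1, an integer step of 1 and a
// non-uniform EntryVal: the call below materializes ScalarIV + 0, + 1, + 2
// and + 3 for the scalarized users, alongside the vector IV created above.)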
2209 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2210 return; 2211 } 2212 2213 // All IV users are scalar instructions, so only emit a scalar IV, not a 2214 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2215 // predicate used by the masked loads/stores. 2216 Value *ScalarIV = CreateScalarIV(Step); 2217 if (!Cost->isScalarEpilogueAllowed()) 2218 CreateSplatIV(ScalarIV, Step); 2219 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2220 } 2221 2222 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2223 Instruction::BinaryOps BinOp) { 2224 // Create and check the types. 2225 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2226 int VLen = ValVTy->getNumElements(); 2227 2228 Type *STy = Val->getType()->getScalarType(); 2229 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2230 "Induction Step must be an integer or FP"); 2231 assert(Step->getType() == STy && "Step has wrong type"); 2232 2233 SmallVector<Constant *, 8> Indices; 2234 2235 if (STy->isIntegerTy()) { 2236 // Create a vector of consecutive numbers from zero to VF. 2237 for (int i = 0; i < VLen; ++i) 2238 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2239 2240 // Add the consecutive indices to the vector value. 2241 Constant *Cv = ConstantVector::get(Indices); 2242 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2243 Step = Builder.CreateVectorSplat(VLen, Step); 2244 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2245 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2246 // which can be found from the original scalar operations. 2247 Step = Builder.CreateMul(Cv, Step); 2248 return Builder.CreateAdd(Val, Step, "induction"); 2249 } 2250 2251 // Floating point induction. 2252 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2253 "Binary Opcode should be specified for FP induction"); 2254 // Create a vector of consecutive numbers from zero to VF. 2255 for (int i = 0; i < VLen; ++i) 2256 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2257 2258 // Add the consecutive indices to the vector value. 2259 Constant *Cv = ConstantVector::get(Indices); 2260 2261 Step = Builder.CreateVectorSplat(VLen, Step); 2262 2263 // Floating point operations had to be 'fast' to enable the induction. 2264 FastMathFlags Flags; 2265 Flags.setFast(); 2266 2267 Value *MulOp = Builder.CreateFMul(Cv, Step); 2268 if (isa<Instruction>(MulOp)) 2269 // Have to check, MulOp may be a constant 2270 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2271 2272 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2273 if (isa<Instruction>(BOp)) 2274 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2275 return BOp; 2276 } 2277 2278 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2279 Instruction *EntryVal, 2280 const InductionDescriptor &ID) { 2281 // We shouldn't have to build scalar steps if we aren't vectorizing. 2282 assert(VF.isVector() && "VF should be greater than one"); 2283 assert(!VF.isScalable() && 2284 "the code below assumes a fixed number of elements at compile time"); 2285 // Get the value type and ensure it and the step have the same integer type. 2286 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2287 assert(ScalarIVTy == Step->getType() && 2288 "Val and Step should have the same type"); 2289 2290 // We build scalar steps for both integer and floating-point induction 2291 // variables. 
Here, we determine the kind of arithmetic we will perform. 2292 Instruction::BinaryOps AddOp; 2293 Instruction::BinaryOps MulOp; 2294 if (ScalarIVTy->isIntegerTy()) { 2295 AddOp = Instruction::Add; 2296 MulOp = Instruction::Mul; 2297 } else { 2298 AddOp = ID.getInductionOpcode(); 2299 MulOp = Instruction::FMul; 2300 } 2301 2302 // Determine the number of scalars we need to generate for each unroll 2303 // iteration. If EntryVal is uniform, we only need to generate the first 2304 // lane. Otherwise, we generate all VF values. 2305 unsigned Lanes = 2306 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2307 ? 1 2308 : VF.getKnownMinValue(); 2309 // Compute the scalar steps and save the results in VectorLoopValueMap. 2310 for (unsigned Part = 0; Part < UF; ++Part) { 2311 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2312 auto *StartIdx = getSignedIntOrFpConstant( 2313 ScalarIVTy, VF.getKnownMinValue() * Part + Lane); 2314 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2315 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2316 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2317 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2318 } 2319 } 2320 } 2321 2322 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2323 assert(V != Induction && "The new induction variable should not be used."); 2324 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2325 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2326 2327 // If we have a stride that is replaced by one, do it here. Defer this for 2328 // the VPlan-native path until we start running Legal checks in that path. 2329 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2330 V = ConstantInt::get(V->getType(), 1); 2331 2332 // If we have a vector mapped to this value, return it. 2333 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2334 return VectorLoopValueMap.getVectorValue(V, Part); 2335 2336 // If the value has not been vectorized, check if it has been scalarized 2337 // instead. If it has been scalarized, and we actually need the value in 2338 // vector form, we will construct the vector values on demand. 2339 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2340 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2341 2342 // If we've scalarized a value, that value should be an instruction. 2343 auto *I = cast<Instruction>(V); 2344 2345 // If we aren't vectorizing, we can just copy the scalar map values over to 2346 // the vector map. 2347 if (VF.isScalar()) { 2348 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2349 return ScalarValue; 2350 } 2351 2352 // Get the last scalar instruction we generated for V and Part. If the value 2353 // is known to be uniform after vectorization, this corresponds to lane zero 2354 // of the Part unroll iteration. Otherwise, the last instruction is the one 2355 // we created for the last vector lane of the Part unroll iteration. 2356 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2357 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2358 ? 0 2359 : VF.getKnownMinValue() - 1; 2360 auto *LastInst = cast<Instruction>( 2361 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2362 2363 // Set the insert point after the last scalarized instruction. This ensures 2364 // the insertelement sequence will directly follow the scalar definitions. 
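// (Illustrative, assuming VF = 4 and a non-uniform scalarized value: the
// packing further down emits four insertelement instructions immediately
// after the lane-3 scalar definition, so later vector users find the packed
// value already materialized.)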
2365 auto OldIP = Builder.saveIP(); 2366 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2367 Builder.SetInsertPoint(&*NewIP); 2368 2369 // However, if we are vectorizing, we need to construct the vector values. 2370 // If the value is known to be uniform after vectorization, we can just 2371 // broadcast the scalar value corresponding to lane zero for each unroll 2372 // iteration. Otherwise, we construct the vector values using insertelement 2373 // instructions. Since the resulting vectors are stored in 2374 // VectorLoopValueMap, we will only generate the insertelements once. 2375 Value *VectorValue = nullptr; 2376 if (Cost->isUniformAfterVectorization(I, VF)) { 2377 VectorValue = getBroadcastInstrs(ScalarValue); 2378 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2379 } else { 2380 // Initialize packing with insertelements to start from undef. 2381 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2382 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2383 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2384 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2385 packScalarIntoVectorValue(V, {Part, Lane}); 2386 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2387 } 2388 Builder.restoreIP(OldIP); 2389 return VectorValue; 2390 } 2391 2392 // If this scalar is unknown, assume that it is a constant or that it is 2393 // loop invariant. Broadcast V and save the value for future uses. 2394 Value *B = getBroadcastInstrs(V); 2395 VectorLoopValueMap.setVectorValue(V, Part, B); 2396 return B; 2397 } 2398 2399 Value * 2400 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2401 const VPIteration &Instance) { 2402 // If the value is not an instruction contained in the loop, it should 2403 // already be scalar. 2404 if (OrigLoop->isLoopInvariant(V)) 2405 return V; 2406 2407 assert(Instance.Lane > 0 2408 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2409 : true && "Uniform values only have lane zero"); 2410 2411 // If the value from the original loop has not been vectorized, it is 2412 // represented by UF x VF scalar values in the new loop. Return the requested 2413 // scalar value. 2414 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2415 return VectorLoopValueMap.getScalarValue(V, Instance); 2416 2417 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2418 // for the given unroll part. If this entry is not a vector type (i.e., the 2419 // vectorization factor is one), there is no need to generate an 2420 // extractelement instruction. 2421 auto *U = getOrCreateVectorValue(V, Instance.Part); 2422 if (!U->getType()->isVectorTy()) { 2423 assert(VF.isScalar() && "Value not scalarized has non-vector type"); 2424 return U; 2425 } 2426 2427 // Otherwise, the value from the original loop has been vectorized and is 2428 // represented by UF vector values. Extract and return the requested scalar 2429 // value from the appropriate vector lane. 
2430 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2431 } 2432 2433 void InnerLoopVectorizer::packScalarIntoVectorValue( 2434 Value *V, const VPIteration &Instance) { 2435 assert(V != Induction && "The new induction variable should not be used."); 2436 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2437 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2438 2439 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2440 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2441 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2442 Builder.getInt32(Instance.Lane)); 2443 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2444 } 2445 2446 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2447 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2448 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2449 SmallVector<int, 8> ShuffleMask; 2450 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2451 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2452 2453 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); 2454 } 2455 2456 // Return whether we allow using masked interleave-groups (for dealing with 2457 // strided loads/stores that reside in predicated blocks, or for dealing 2458 // with gaps). 2459 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2460 // If an override option has been passed in for interleaved accesses, use it. 2461 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2462 return EnableMaskedInterleavedMemAccesses; 2463 2464 return TTI.enableMaskedInterleavedAccessVectorization(); 2465 } 2466 2467 // Try to vectorize the interleave group that \p Instr belongs to. 2468 // 2469 // E.g. Translate following interleaved load group (factor = 3): 2470 // for (i = 0; i < N; i+=3) { 2471 // R = Pic[i]; // Member of index 0 2472 // G = Pic[i+1]; // Member of index 1 2473 // B = Pic[i+2]; // Member of index 2 2474 // ... // do something to R, G, B 2475 // } 2476 // To: 2477 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2478 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2479 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2480 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2481 // 2482 // Or translate following interleaved store group (factor = 3): 2483 // for (i = 0; i < N; i+=3) { 2484 // ... do something to R, G, B 2485 // Pic[i] = R; // Member of index 0 2486 // Pic[i+1] = G; // Member of index 1 2487 // Pic[i+2] = B; // Member of index 2 2488 // } 2489 // To: 2490 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2491 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2492 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2493 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2494 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2495 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2496 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2497 VPValue *Addr, ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask) { 2498 Instruction *Instr = Group->getInsertPos(); 2499 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2500 2501 // Prepare for the vector type of the interleaved load/store. 
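// (Illustrative, matching the example above: for an i32 group with
// InterleaveFactor = 3 and VF = 4, VecTy below is <12 x i32>.)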
2502 Type *ScalarTy = getMemInstValueType(Instr); 2503 unsigned InterleaveFactor = Group->getFactor(); 2504 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2505 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2506 2507 // Prepare for the new pointers. 2508 SmallVector<Value *, 2> AddrParts; 2509 unsigned Index = Group->getIndex(Instr); 2510 2511 // TODO: extend the masked interleaved-group support to reversed access. 2512 assert((!BlockInMask || !Group->isReverse()) && 2513 "Reversed masked interleave-group not supported."); 2514 2515 // If the group is reverse, adjust the index to refer to the last vector lane 2516 // instead of the first. We adjust the index from the first vector lane, 2517 // rather than directly getting the pointer for lane VF - 1, because the 2518 // pointer operand of the interleaved access is supposed to be uniform. For 2519 // uniform instructions, we're only required to generate a value for the 2520 // first vector lane in each unroll iteration. 2521 assert(!VF.isScalable() && 2522 "scalable vector reverse operation is not implemented"); 2523 if (Group->isReverse()) 2524 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2525 2526 for (unsigned Part = 0; Part < UF; Part++) { 2527 Value *AddrPart = State.get(Addr, {Part, 0}); 2528 setDebugLocFromInst(Builder, AddrPart); 2529 2530 // Notice current instruction could be any index. Need to adjust the address 2531 // to the member of index 0. 2532 // 2533 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2534 // b = A[i]; // Member of index 0 2535 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2536 // 2537 // E.g. A[i+1] = a; // Member of index 1 2538 // A[i] = b; // Member of index 0 2539 // A[i+2] = c; // Member of index 2 (Current instruction) 2540 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2541 2542 bool InBounds = false; 2543 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2544 InBounds = gep->isInBounds(); 2545 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2546 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2547 2548 // Cast to the vector pointer type. 2549 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2550 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2551 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2552 } 2553 2554 setDebugLocFromInst(Builder, Instr); 2555 Value *UndefVec = UndefValue::get(VecTy); 2556 2557 Value *MaskForGaps = nullptr; 2558 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2559 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2560 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2561 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2562 } 2563 2564 // Vectorize the interleaved load group. 2565 if (isa<LoadInst>(Instr)) { 2566 // For each unroll part, create a wide load for the group. 
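// (Illustrative, continuing the factor-3, VF = 4 example: each part loads a
// single <12 x i32> value; if the block is predicated, the per-lane mask is
// first replicated per member, e.g. <m0,m0,m0, m1,m1,m1, m2,m2,m2, m3,m3,m3>,
// and combined with the gap mask when one is required.)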
2567 SmallVector<Value *, 2> NewLoads; 2568 for (unsigned Part = 0; Part < UF; Part++) { 2569 Instruction *NewLoad; 2570 if (BlockInMask || MaskForGaps) { 2571 assert(useMaskedInterleavedAccesses(*TTI) && 2572 "masked interleaved groups are not allowed."); 2573 Value *GroupMask = MaskForGaps; 2574 if (BlockInMask) { 2575 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2576 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2577 Value *ShuffledMask = Builder.CreateShuffleVector( 2578 BlockInMaskPart, 2579 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2580 "interleaved.mask"); 2581 GroupMask = MaskForGaps 2582 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2583 MaskForGaps) 2584 : ShuffledMask; 2585 } 2586 NewLoad = 2587 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2588 GroupMask, UndefVec, "wide.masked.vec"); 2589 } 2590 else 2591 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2592 Group->getAlign(), "wide.vec"); 2593 Group->addMetadata(NewLoad); 2594 NewLoads.push_back(NewLoad); 2595 } 2596 2597 // For each member in the group, shuffle out the appropriate data from the 2598 // wide loads. 2599 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2600 Instruction *Member = Group->getMember(I); 2601 2602 // Skip the gaps in the group. 2603 if (!Member) 2604 continue; 2605 2606 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2607 auto StrideMask = 2608 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2609 for (unsigned Part = 0; Part < UF; Part++) { 2610 Value *StridedVec = Builder.CreateShuffleVector( 2611 NewLoads[Part], StrideMask, "strided.vec"); 2612 2613 // If this member has different type, cast the result type. 2614 if (Member->getType() != ScalarTy) { 2615 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2616 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2617 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2618 } 2619 2620 if (Group->isReverse()) 2621 StridedVec = reverseVector(StridedVec); 2622 2623 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2624 } 2625 } 2626 return; 2627 } 2628 2629 // The sub vector type for current instruction. 2630 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2631 auto *SubVT = VectorType::get(ScalarTy, VF); 2632 2633 // Vectorize the interleaved store group. 2634 for (unsigned Part = 0; Part < UF; Part++) { 2635 // Collect the stored vector from each member. 2636 SmallVector<Value *, 4> StoredVecs; 2637 for (unsigned i = 0; i < InterleaveFactor; i++) { 2638 // Interleaved store group doesn't allow a gap, so each index has a member 2639 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2640 2641 Value *StoredVec = State.get(StoredValues[i], Part); 2642 2643 if (Group->isReverse()) 2644 StoredVec = reverseVector(StoredVec); 2645 2646 // If this member has different type, cast it to a unified type. 2647 2648 if (StoredVec->getType() != SubVT) 2649 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2650 2651 StoredVecs.push_back(StoredVec); 2652 } 2653 2654 // Concatenate all vectors into a wide vector. 2655 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2656 2657 // Interleave the elements in the wide vector. 
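// (Illustrative, continuing the factor-3, VF = 4 store example: the mask
// created below is <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>, turning the
// concatenated R|G|B vector back into R,G,B,R,G,B,... element order.)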
2658 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2659 Value *IVec = Builder.CreateShuffleVector( 2660 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2661 "interleaved.vec"); 2662 2663 Instruction *NewStoreInstr; 2664 if (BlockInMask) { 2665 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2666 Value *ShuffledMask = Builder.CreateShuffleVector( 2667 BlockInMaskPart, 2668 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2669 "interleaved.mask"); 2670 NewStoreInstr = Builder.CreateMaskedStore( 2671 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2672 } 2673 else 2674 NewStoreInstr = 2675 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2676 2677 Group->addMetadata(NewStoreInstr); 2678 } 2679 } 2680 2681 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2682 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2683 VPValue *StoredValue, VPValue *BlockInMask) { 2684 // Attempt to issue a wide load. 2685 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2686 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2687 2688 assert((LI || SI) && "Invalid Load/Store instruction"); 2689 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2690 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2691 2692 LoopVectorizationCostModel::InstWidening Decision = 2693 Cost->getWideningDecision(Instr, VF); 2694 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2695 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2696 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2697 "CM decision is not to widen the memory instruction"); 2698 2699 Type *ScalarDataTy = getMemInstValueType(Instr); 2700 2701 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2702 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2703 const Align Alignment = getLoadStoreAlignment(Instr); 2704 2705 // Determine if the pointer operand of the access is either consecutive or 2706 // reverse consecutive. 2707 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2708 bool ConsecutiveStride = 2709 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2710 bool CreateGatherScatter = 2711 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2712 2713 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2714 // gather/scatter. Otherwise Decision should have been to Scalarize. 2715 assert((ConsecutiveStride || CreateGatherScatter) && 2716 "The instruction should be scalarized"); 2717 (void)ConsecutiveStride; 2718 2719 VectorParts BlockInMaskParts(UF); 2720 bool isMaskRequired = BlockInMask; 2721 if (isMaskRequired) 2722 for (unsigned Part = 0; Part < UF; ++Part) 2723 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2724 2725 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2726 // Calculate the pointer for the specific unroll-part. 2727 GetElementPtrInst *PartPtr = nullptr; 2728 2729 bool InBounds = false; 2730 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2731 InBounds = gep->isInBounds(); 2732 2733 if (Reverse) { 2734 // If the address is consecutive but reversed, then the 2735 // wide store needs to start at the last vector element. 
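      // Worked example (illustrative, assuming an array A of the accessed
      // scalar type): with VF = 4, part 0 applies offsets 0 and 1 - 4 = -3,
      // so the part pointer becomes &A[i - 3] and the wide access covers
      // A[i-3..i]; part 1 applies -4 and then -3, giving &A[i - 7] for
      // A[i-7..i-4]. The loaded or stored vector itself is reversed
      // separately.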
2736 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2737 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2738 PartPtr->setIsInBounds(InBounds); 2739 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2740 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2741 PartPtr->setIsInBounds(InBounds); 2742 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2743 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2744 } else { 2745 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2746 ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue()))); 2747 PartPtr->setIsInBounds(InBounds); 2748 } 2749 2750 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2751 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2752 }; 2753 2754 // Handle Stores: 2755 if (SI) { 2756 setDebugLocFromInst(Builder, SI); 2757 2758 for (unsigned Part = 0; Part < UF; ++Part) { 2759 Instruction *NewSI = nullptr; 2760 Value *StoredVal = State.get(StoredValue, Part); 2761 if (CreateGatherScatter) { 2762 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2763 Value *VectorGep = State.get(Addr, Part); 2764 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2765 MaskPart); 2766 } else { 2767 if (Reverse) { 2768 // If we store to reverse consecutive memory locations, then we need 2769 // to reverse the order of elements in the stored value. 2770 StoredVal = reverseVector(StoredVal); 2771 // We don't want to update the value in the map as it might be used in 2772 // another expression. So don't call resetVectorValue(StoredVal). 2773 } 2774 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2775 if (isMaskRequired) 2776 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2777 BlockInMaskParts[Part]); 2778 else 2779 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2780 } 2781 addMetadata(NewSI, SI); 2782 } 2783 return; 2784 } 2785 2786 // Handle loads. 2787 assert(LI && "Must have a load instruction"); 2788 setDebugLocFromInst(Builder, LI); 2789 for (unsigned Part = 0; Part < UF; ++Part) { 2790 Value *NewLI; 2791 if (CreateGatherScatter) { 2792 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2793 Value *VectorGep = State.get(Addr, Part); 2794 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2795 nullptr, "wide.masked.gather"); 2796 addMetadata(NewLI, LI); 2797 } else { 2798 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2799 if (isMaskRequired) 2800 NewLI = Builder.CreateMaskedLoad( 2801 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2802 "wide.masked.load"); 2803 else 2804 NewLI = 2805 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2806 2807 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2808 addMetadata(NewLI, LI); 2809 if (Reverse) 2810 NewLI = reverseVector(NewLI); 2811 } 2812 2813 State.set(Def, Instr, NewLI, Part); 2814 } 2815 } 2816 2817 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2818 const VPIteration &Instance, 2819 bool IfPredicateInstr, 2820 VPTransformState &State) { 2821 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2822 2823 setDebugLocFromInst(Builder, Instr); 2824 2825 // Does this instruction return a value ? 
2826 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2827 2828 Instruction *Cloned = Instr->clone(); 2829 if (!IsVoidRetTy) 2830 Cloned->setName(Instr->getName() + ".cloned"); 2831 2832 // Replace the operands of the cloned instructions with their scalar 2833 // equivalents in the new loop. 2834 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2835 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 2836 auto InputInstance = Instance; 2837 if (!Operand || !OrigLoop->contains(Operand) || 2838 (Cost->isUniformAfterVectorization(Operand, State.VF))) 2839 InputInstance.Lane = 0; 2840 auto *NewOp = State.get(User.getOperand(op), InputInstance); 2841 Cloned->setOperand(op, NewOp); 2842 } 2843 addNewMetadata(Cloned, Instr); 2844 2845 // Place the cloned scalar in the new loop. 2846 Builder.Insert(Cloned); 2847 2848 // TODO: Set result for VPValue of VPReciplicateRecipe. This requires 2849 // representing scalar values in VPTransformState. Add the cloned scalar to 2850 // the scalar map entry. 2851 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2852 2853 // If we just cloned a new assumption, add it the assumption cache. 2854 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2855 if (II->getIntrinsicID() == Intrinsic::assume) 2856 AC->registerAssumption(II); 2857 2858 // End if-block. 2859 if (IfPredicateInstr) 2860 PredicatedInstructions.push_back(Cloned); 2861 } 2862 2863 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2864 Value *End, Value *Step, 2865 Instruction *DL) { 2866 BasicBlock *Header = L->getHeader(); 2867 BasicBlock *Latch = L->getLoopLatch(); 2868 // As we're just creating this loop, it's possible no latch exists 2869 // yet. If so, use the header as this will be a single block loop. 2870 if (!Latch) 2871 Latch = Header; 2872 2873 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2874 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2875 setDebugLocFromInst(Builder, OldInst); 2876 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2877 2878 Builder.SetInsertPoint(Latch->getTerminator()); 2879 setDebugLocFromInst(Builder, OldInst); 2880 2881 // Create i+1 and fill the PHINode. 2882 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2883 Induction->addIncoming(Start, L->getLoopPreheader()); 2884 Induction->addIncoming(Next, Latch); 2885 // Create the compare. 2886 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2887 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2888 2889 // Now we have two terminators. Remove the old one from the block. 2890 Latch->getTerminator()->eraseFromParent(); 2891 2892 return Induction; 2893 } 2894 2895 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2896 if (TripCount) 2897 return TripCount; 2898 2899 assert(L && "Create Trip Count for null loop."); 2900 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2901 // Find the loop boundaries. 2902 ScalarEvolution *SE = PSE.getSE(); 2903 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2904 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2905 "Invalid loop count"); 2906 2907 Type *IdxTy = Legal->getWidestInductionType(); 2908 assert(IdxTy && "No type for induction"); 2909 2910 // The exit count might have the type of i64 while the phi is i32. This can 2911 // happen if we have an induction variable that is sign extended before the 2912 // compare. 
The only way that we get a backedge taken count is that the 2913 // induction variable was signed and as such will not overflow. In such a case 2914 // truncation is legal. 2915 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2916 IdxTy->getPrimitiveSizeInBits()) 2917 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2918 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2919 2920 // Get the total trip count from the count by adding 1. 2921 const SCEV *ExitCount = SE->getAddExpr( 2922 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2923 2924 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2925 2926 // Expand the trip count and place the new instructions in the preheader. 2927 // Notice that the pre-header does not change, only the loop body. 2928 SCEVExpander Exp(*SE, DL, "induction"); 2929 2930 // Count holds the overall loop count (N). 2931 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2932 L->getLoopPreheader()->getTerminator()); 2933 2934 if (TripCount->getType()->isPointerTy()) 2935 TripCount = 2936 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2937 L->getLoopPreheader()->getTerminator()); 2938 2939 return TripCount; 2940 } 2941 2942 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2943 if (VectorTripCount) 2944 return VectorTripCount; 2945 2946 Value *TC = getOrCreateTripCount(L); 2947 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2948 2949 Type *Ty = TC->getType(); 2950 // This is where we can make the step a runtime constant. 2951 assert(!VF.isScalable() && "scalable vectorization is not supported yet"); 2952 Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF); 2953 2954 // If the tail is to be folded by masking, round the number of iterations N 2955 // up to a multiple of Step instead of rounding down. This is done by first 2956 // adding Step-1 and then rounding down. Note that it's ok if this addition 2957 // overflows: the vector induction variable will eventually wrap to zero given 2958 // that it starts at zero and its Step is a power of two; the loop will then 2959 // exit, with the last early-exit vector comparison also producing all-true. 2960 if (Cost->foldTailByMasking()) { 2961 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2962 "VF*UF must be a power of 2 when folding tail by masking"); 2963 TC = Builder.CreateAdd( 2964 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 2965 } 2966 2967 // Now we need to generate the expression for the part of the loop that the 2968 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2969 // iterations are not required for correctness, or N - Step, otherwise. Step 2970 // is equal to the vectorization factor (number of SIMD elements) times the 2971 // unroll factor (number of SIMD instructions). 2972 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2973 2974 // If there is a non-reversed interleaved group that may speculatively access 2975 // memory out-of-bounds, we need to ensure that there will be at least one 2976 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2977 // the trip count, we set the remainder to be equal to the step. If the step 2978 // does not evenly divide the trip count, no adjustment is necessary since 2979 // there will already be scalar iterations. Note that the minimum iterations 2980 // check ensures that N >= Step. 
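  // Worked example (illustrative numbers): with N = 12 and Step = VF * UF = 4,
  // the remainder R = 12 % 4 = 0 would leave no scalar iterations, so R is
  // bumped to 4 and the vector trip count (n.vec) becomes 8, guaranteeing 4
  // scalar epilogue iterations. With N = 10, R = 2 already leaves scalar work,
  // so no adjustment is made and n.vec is 8.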
2981 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 2982 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2983 R = Builder.CreateSelect(IsZero, Step, R); 2984 } 2985 2986 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2987 2988 return VectorTripCount; 2989 } 2990 2991 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2992 const DataLayout &DL) { 2993 // Verify that V is a vector type with same number of elements as DstVTy. 2994 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2995 unsigned VF = DstFVTy->getNumElements(); 2996 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2997 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2998 Type *SrcElemTy = SrcVecTy->getElementType(); 2999 Type *DstElemTy = DstFVTy->getElementType(); 3000 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3001 "Vector elements must have same size"); 3002 3003 // Do a direct cast if element types are castable. 3004 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3005 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3006 } 3007 // V cannot be directly casted to desired vector type. 3008 // May happen when V is a floating point vector but DstVTy is a vector of 3009 // pointers or vice-versa. Handle this using a two-step bitcast using an 3010 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3011 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3012 "Only one type should be a pointer type"); 3013 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3014 "Only one type should be a floating point type"); 3015 Type *IntTy = 3016 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3017 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3018 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3019 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3020 } 3021 3022 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3023 BasicBlock *Bypass) { 3024 Value *Count = getOrCreateTripCount(L); 3025 // Reuse existing vector loop preheader for TC checks. 3026 // Note that new preheader block is generated for vector loop. 3027 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3028 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3029 3030 // Generate code to check if the loop's trip count is less than VF * UF, or 3031 // equal to it in case a scalar epilogue is required; this implies that the 3032 // vector trip count is zero. This check also covers the case where adding one 3033 // to the backedge-taken count overflowed leading to an incorrect trip count 3034 // of zero. In this case we will also jump to the scalar loop. 3035 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3036 : ICmpInst::ICMP_ULT; 3037 3038 // If tail is to be folded, vector loop takes care of all iterations. 3039 Value *CheckMinIters = Builder.getFalse(); 3040 if (!Cost->foldTailByMasking()) { 3041 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3042 CheckMinIters = Builder.CreateICmp( 3043 P, Count, 3044 ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF), 3045 "min.iters.check"); 3046 } 3047 // Create new preheader for vector loop. 
3048 LoopVectorPreHeader = 3049 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3050 "vector.ph"); 3051 3052 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3053 DT->getNode(Bypass)->getIDom()) && 3054 "TC check is expected to dominate Bypass"); 3055 3056 // Update dominator for Bypass & LoopExit. 3057 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3058 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3059 3060 ReplaceInstWithInst( 3061 TCCheckBlock->getTerminator(), 3062 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3063 LoopBypassBlocks.push_back(TCCheckBlock); 3064 } 3065 3066 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3067 // Reuse existing vector loop preheader for SCEV checks. 3068 // Note that new preheader block is generated for vector loop. 3069 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 3070 3071 // Generate the code to check that the SCEV assumptions that we made. 3072 // We want the new basic block to start at the first instruction in a 3073 // sequence of instructions that form a check. 3074 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 3075 "scev.check"); 3076 Value *SCEVCheck = Exp.expandCodeForPredicate( 3077 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 3078 3079 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 3080 if (C->isZero()) 3081 return; 3082 3083 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3084 (OptForSizeBasedOnProfile && 3085 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3086 "Cannot SCEV check stride or overflow when optimizing for size"); 3087 3088 SCEVCheckBlock->setName("vector.scevcheck"); 3089 // Create new preheader for vector loop. 3090 LoopVectorPreHeader = 3091 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 3092 nullptr, "vector.ph"); 3093 3094 // Update dominator only if this is first RT check. 3095 if (LoopBypassBlocks.empty()) { 3096 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3097 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3098 } 3099 3100 ReplaceInstWithInst( 3101 SCEVCheckBlock->getTerminator(), 3102 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 3103 LoopBypassBlocks.push_back(SCEVCheckBlock); 3104 AddedSafetyChecks = true; 3105 } 3106 3107 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 3108 // VPlan-native path does not do any analysis for runtime checks currently. 3109 if (EnableVPlanNativePath) 3110 return; 3111 3112 // Reuse existing vector loop preheader for runtime memory checks. 3113 // Note that new preheader block is generated for vector loop. 3114 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 3115 3116 // Generate the code that checks in runtime if arrays overlap. We put the 3117 // checks into a separate block to make the more common case of few elements 3118 // faster. 
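  // For instance (an illustrative case, not tied to a specific input), for a
  // loop like 'a[i] = b[i] + c[i]' whose pointers cannot be disambiguated
  // statically, the emitted checks compare the address range accessed through
  // 'a' against the ranges accessed through 'b' and 'c' at run time, and
  // branch to the scalar loop (the bypass block) whenever the ranges may
  // overlap.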
3119 auto *LAI = Legal->getLAI(); 3120 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 3121 if (!RtPtrChecking.Need) 3122 return; 3123 3124 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3125 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3126 "Cannot emit memory checks when optimizing for size, unless forced " 3127 "to vectorize."); 3128 ORE->emit([&]() { 3129 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3130 L->getStartLoc(), L->getHeader()) 3131 << "Code-size may be reduced by not forcing " 3132 "vectorization, or by source-code modifications " 3133 "eliminating the need for runtime checks " 3134 "(e.g., adding 'restrict')."; 3135 }); 3136 } 3137 3138 MemCheckBlock->setName("vector.memcheck"); 3139 // Create new preheader for vector loop. 3140 LoopVectorPreHeader = 3141 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 3142 "vector.ph"); 3143 3144 auto *CondBranch = cast<BranchInst>( 3145 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 3146 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 3147 LoopBypassBlocks.push_back(MemCheckBlock); 3148 AddedSafetyChecks = true; 3149 3150 // Update dominator only if this is first RT check. 3151 if (LoopBypassBlocks.empty()) { 3152 DT->changeImmediateDominator(Bypass, MemCheckBlock); 3153 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 3154 } 3155 3156 Instruction *FirstCheckInst; 3157 Instruction *MemRuntimeCheck; 3158 std::tie(FirstCheckInst, MemRuntimeCheck) = 3159 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 3160 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 3161 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 3162 "claimed checks are required"); 3163 CondBranch->setCondition(MemRuntimeCheck); 3164 3165 // We currently don't use LoopVersioning for the actual loop cloning but we 3166 // still use it to add the noalias metadata. 3167 LVer = std::make_unique<LoopVersioning>( 3168 *Legal->getLAI(), 3169 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3170 DT, PSE.getSE()); 3171 LVer->prepareNoAliasMetadata(); 3172 } 3173 3174 Value *InnerLoopVectorizer::emitTransformedIndex( 3175 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3176 const InductionDescriptor &ID) const { 3177 3178 SCEVExpander Exp(*SE, DL, "induction"); 3179 auto Step = ID.getStep(); 3180 auto StartValue = ID.getStartValue(); 3181 assert(Index->getType() == Step->getType() && 3182 "Index type does not match StepValue type"); 3183 3184 // Note: the IR at this point is broken. We cannot use SE to create any new 3185 // SCEV and then expand it, hoping that SCEV's simplification will give us 3186 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3187 // lead to various SCEV crashes. So all we can do is to use builder and rely 3188 // on InstCombine for future simplifications. Here we handle some trivial 3189 // cases only. 
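  // The two helpers below fold only the identity cases: an add with a zero
  // constant or a multiply by a one constant simply returns the other operand.
  // For example (illustrative), a transformed index of the form
  // 'Start + Index * Step' with a constant Step of 1 is emitted as a single
  // add rather than an add of a redundant multiply.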
3190 auto CreateAdd = [&B](Value *X, Value *Y) { 3191 assert(X->getType() == Y->getType() && "Types don't match!"); 3192 if (auto *CX = dyn_cast<ConstantInt>(X)) 3193 if (CX->isZero()) 3194 return Y; 3195 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3196 if (CY->isZero()) 3197 return X; 3198 return B.CreateAdd(X, Y); 3199 }; 3200 3201 auto CreateMul = [&B](Value *X, Value *Y) { 3202 assert(X->getType() == Y->getType() && "Types don't match!"); 3203 if (auto *CX = dyn_cast<ConstantInt>(X)) 3204 if (CX->isOne()) 3205 return Y; 3206 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3207 if (CY->isOne()) 3208 return X; 3209 return B.CreateMul(X, Y); 3210 }; 3211 3212 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3213 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3214 // the DomTree is not kept up-to-date for additional blocks generated in the 3215 // vector loop. By using the header as insertion point, we guarantee that the 3216 // expanded instructions dominate all their uses. 3217 auto GetInsertPoint = [this, &B]() { 3218 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3219 if (InsertBB != LoopVectorBody && 3220 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3221 return LoopVectorBody->getTerminator(); 3222 return &*B.GetInsertPoint(); 3223 }; 3224 switch (ID.getKind()) { 3225 case InductionDescriptor::IK_IntInduction: { 3226 assert(Index->getType() == StartValue->getType() && 3227 "Index type does not match StartValue type"); 3228 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3229 return B.CreateSub(StartValue, Index); 3230 auto *Offset = CreateMul( 3231 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3232 return CreateAdd(StartValue, Offset); 3233 } 3234 case InductionDescriptor::IK_PtrInduction: { 3235 assert(isa<SCEVConstant>(Step) && 3236 "Expected constant step for pointer induction"); 3237 return B.CreateGEP( 3238 StartValue->getType()->getPointerElementType(), StartValue, 3239 CreateMul(Index, 3240 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3241 } 3242 case InductionDescriptor::IK_FpInduction: { 3243 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3244 auto InductionBinOp = ID.getInductionBinOp(); 3245 assert(InductionBinOp && 3246 (InductionBinOp->getOpcode() == Instruction::FAdd || 3247 InductionBinOp->getOpcode() == Instruction::FSub) && 3248 "Original bin op should be defined for FP induction"); 3249 3250 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3251 3252 // Floating point operations had to be 'fast' to enable the induction. 3253 FastMathFlags Flags; 3254 Flags.setFast(); 3255 3256 Value *MulExp = B.CreateFMul(StepValue, Index); 3257 if (isa<Instruction>(MulExp)) 3258 // We have to check, the MulExp may be a constant. 
3259 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3260 3261 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3262 "induction"); 3263 if (isa<Instruction>(BOp)) 3264 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3265 3266 return BOp; 3267 } 3268 case InductionDescriptor::IK_NoInduction: 3269 return nullptr; 3270 } 3271 llvm_unreachable("invalid enum"); 3272 } 3273 3274 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3275 LoopScalarBody = OrigLoop->getHeader(); 3276 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3277 LoopExitBlock = OrigLoop->getExitBlock(); 3278 assert(LoopExitBlock && "Must have an exit block"); 3279 assert(LoopVectorPreHeader && "Invalid loop structure"); 3280 3281 LoopMiddleBlock = 3282 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3283 LI, nullptr, Twine(Prefix) + "middle.block"); 3284 LoopScalarPreHeader = 3285 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3286 nullptr, Twine(Prefix) + "scalar.ph"); 3287 // We intentionally don't let SplitBlock to update LoopInfo since 3288 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3289 // LoopVectorBody is explicitly added to the correct place few lines later. 3290 LoopVectorBody = 3291 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3292 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3293 3294 // Update dominator for loop exit. 3295 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3296 3297 // Create and register the new vector loop. 3298 Loop *Lp = LI->AllocateLoop(); 3299 Loop *ParentLoop = OrigLoop->getParentLoop(); 3300 3301 // Insert the new loop into the loop nest and register the new basic blocks 3302 // before calling any utilities such as SCEV that require valid LoopInfo. 3303 if (ParentLoop) { 3304 ParentLoop->addChildLoop(Lp); 3305 } else { 3306 LI->addTopLevelLoop(Lp); 3307 } 3308 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3309 return Lp; 3310 } 3311 3312 void InnerLoopVectorizer::createInductionResumeValues( 3313 Loop *L, Value *VectorTripCount, 3314 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3315 assert(VectorTripCount && L && "Expected valid arguments"); 3316 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3317 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3318 "Inconsistent information about additional bypass."); 3319 // We are going to resume the execution of the scalar loop. 3320 // Go over all of the induction variables that we found and fix the 3321 // PHIs that are left in the scalar version of the loop. 3322 // The starting values of PHI nodes depend on the counter of the last 3323 // iteration in the vectorized loop. 3324 // If we come from a bypass edge then we need to start from the original 3325 // start value. 3326 for (auto &InductionEntry : Legal->getInductionVars()) { 3327 PHINode *OrigPhi = InductionEntry.first; 3328 InductionDescriptor II = InductionEntry.second; 3329 3330 // Create phi nodes to merge from the backedge-taken check block. 3331 PHINode *BCResumeVal = 3332 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3333 LoopScalarPreHeader->getTerminator()); 3334 // Copy original phi DL over to the new one. 3335 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3336 Value *&EndValue = IVEndValues[OrigPhi]; 3337 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3338 if (OrigPhi == OldInduction) { 3339 // We know what the end value is. 
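      // Clarifying note: the primary induction is the canonical-style IV that
      // starts at zero and steps by one (see the comment in
      // createVectorizedLoopSkeleton), so its value after the last vector
      // iteration is exactly the vector trip count and no transformed index is
      // needed.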
3340 EndValue = VectorTripCount; 3341 } else { 3342 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3343 Type *StepType = II.getStep()->getType(); 3344 Instruction::CastOps CastOp = 3345 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3346 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3347 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3348 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3349 EndValue->setName("ind.end"); 3350 3351 // Compute the end value for the additional bypass (if applicable). 3352 if (AdditionalBypass.first) { 3353 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3354 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3355 StepType, true); 3356 CRD = 3357 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3358 EndValueFromAdditionalBypass = 3359 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3360 EndValueFromAdditionalBypass->setName("ind.end"); 3361 } 3362 } 3363 // The new PHI merges the original incoming value, in case of a bypass, 3364 // or the value at the end of the vectorized loop. 3365 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3366 3367 // Fix the scalar body counter (PHI node). 3368 // The old induction's phi node in the scalar body needs the truncated 3369 // value. 3370 for (BasicBlock *BB : LoopBypassBlocks) 3371 BCResumeVal->addIncoming(II.getStartValue(), BB); 3372 3373 if (AdditionalBypass.first) 3374 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3375 EndValueFromAdditionalBypass); 3376 3377 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3378 } 3379 } 3380 3381 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3382 MDNode *OrigLoopID) { 3383 assert(L && "Expected valid loop."); 3384 3385 // The trip counts should be cached by now. 3386 Value *Count = getOrCreateTripCount(L); 3387 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3388 3389 // We need the OrigLoop (scalar loop part) latch terminator to help 3390 // produce correct debug info for the middle block BB instructions. 3391 // The legality check stage guarantees that the loop will have a single 3392 // latch. 3393 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3394 "Scalar loop latch terminator isn't a branch"); 3395 BranchInst *ScalarLatchBr = 3396 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3397 3398 // Add a check in the middle block to see if we have completed 3399 // all of the iterations in the first vector loop. 3400 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3401 // If tail is to be folded, we know we don't need to run the remainder. 3402 Value *CmpN = Builder.getTrue(); 3403 if (!Cost->foldTailByMasking()) { 3404 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3405 VectorTripCount, "cmp.n", 3406 LoopMiddleBlock->getTerminator()); 3407 3408 // Here we use the same DebugLoc as the scalar loop latch branch instead 3409 // of the corresponding compare because they may have ended up with 3410 // different line numbers and we want to avoid awkward line stepping while 3411 // debugging. Eg. if the compare has got a line number inside the loop. 
3412 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3413 } 3414 3415 BranchInst *BrInst = 3416 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3417 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3418 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3419 3420 // Get ready to start creating new instructions into the vectorized body. 3421 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3422 "Inconsistent vector loop preheader"); 3423 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3424 3425 Optional<MDNode *> VectorizedLoopID = 3426 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3427 LLVMLoopVectorizeFollowupVectorized}); 3428 if (VectorizedLoopID.hasValue()) { 3429 L->setLoopID(VectorizedLoopID.getValue()); 3430 3431 // Do not setAlreadyVectorized if loop attributes have been defined 3432 // explicitly. 3433 return LoopVectorPreHeader; 3434 } 3435 3436 // Keep all loop hints from the original loop on the vector loop (we'll 3437 // replace the vectorizer-specific hints below). 3438 if (MDNode *LID = OrigLoop->getLoopID()) 3439 L->setLoopID(LID); 3440 3441 LoopVectorizeHints Hints(L, true, *ORE); 3442 Hints.setAlreadyVectorized(); 3443 3444 #ifdef EXPENSIVE_CHECKS 3445 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3446 LI->verify(*DT); 3447 #endif 3448 3449 return LoopVectorPreHeader; 3450 } 3451 3452 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3453 /* 3454 In this function we generate a new loop. The new loop will contain 3455 the vectorized instructions while the old loop will continue to run the 3456 scalar remainder. 3457 3458 [ ] <-- loop iteration number check. 3459 / | 3460 / v 3461 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3462 | / | 3463 | / v 3464 || [ ] <-- vector pre header. 3465 |/ | 3466 | v 3467 | [ ] \ 3468 | [ ]_| <-- vector loop. 3469 | | 3470 | v 3471 | -[ ] <--- middle-block. 3472 | / | 3473 | / v 3474 -|- >[ ] <--- new preheader. 3475 | | 3476 | v 3477 | [ ] \ 3478 | [ ]_| <-- old scalar loop to handle remainder. 3479 \ | 3480 \ v 3481 >[ ] <-- exit block. 3482 ... 3483 */ 3484 3485 // Get the metadata of the original loop before it gets modified. 3486 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3487 3488 // Create an empty vector loop, and prepare basic blocks for the runtime 3489 // checks. 3490 Loop *Lp = createVectorLoopSkeleton(""); 3491 3492 // Now, compare the new count to zero. If it is zero skip the vector loop and 3493 // jump to the scalar loop. This check also covers the case where the 3494 // backedge-taken count is uint##_max: adding one to it will overflow leading 3495 // to an incorrect trip count of zero. In this (rare) case we will also jump 3496 // to the scalar loop. 3497 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3498 3499 // Generate the code to check any assumptions that we've made for SCEV 3500 // expressions. 3501 emitSCEVChecks(Lp, LoopScalarPreHeader); 3502 3503 // Generate the code that checks in runtime if arrays overlap. We put the 3504 // checks into a separate block to make the more common case of few elements 3505 // faster. 3506 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3507 3508 // Some loops have a single integer induction variable, while other loops 3509 // don't. One example is c++ iterators that often have multiple pointer 3510 // induction variables. In the code below we also support a case where we 3511 // don't have a single induction variable. 
3512 // 3513 // We try to obtain an induction variable from the original loop as hard 3514 // as possible. However if we don't find one that: 3515 // - is an integer 3516 // - counts from zero, stepping by one 3517 // - is the size of the widest induction variable type 3518 // then we create a new one. 3519 OldInduction = Legal->getPrimaryInduction(); 3520 Type *IdxTy = Legal->getWidestInductionType(); 3521 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3522 // The loop step is equal to the vectorization factor (num of SIMD elements) 3523 // times the unroll factor (num of SIMD instructions). 3524 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3525 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 3526 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3527 Induction = 3528 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3529 getDebugLocFromInstOrOperands(OldInduction)); 3530 3531 // Emit phis for the new starting index of the scalar loop. 3532 createInductionResumeValues(Lp, CountRoundDown); 3533 3534 return completeLoopSkeleton(Lp, OrigLoopID); 3535 } 3536 3537 // Fix up external users of the induction variable. At this point, we are 3538 // in LCSSA form, with all external PHIs that use the IV having one input value, 3539 // coming from the remainder loop. We need those PHIs to also have a correct 3540 // value for the IV when arriving directly from the middle block. 3541 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3542 const InductionDescriptor &II, 3543 Value *CountRoundDown, Value *EndValue, 3544 BasicBlock *MiddleBlock) { 3545 // There are two kinds of external IV usages - those that use the value 3546 // computed in the last iteration (the PHI) and those that use the penultimate 3547 // value (the value that feeds into the phi from the loop latch). 3548 // We allow both, but they, obviously, have different values. 3549 3550 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3551 3552 DenseMap<Value *, Value *> MissingVals; 3553 3554 // An external user of the last iteration's value should see the value that 3555 // the remainder loop uses to initialize its own IV. 3556 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3557 for (User *U : PostInc->users()) { 3558 Instruction *UI = cast<Instruction>(U); 3559 if (!OrigLoop->contains(UI)) { 3560 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3561 MissingVals[UI] = EndValue; 3562 } 3563 } 3564 3565 // An external user of the penultimate value need to see EndValue - Step. 3566 // The simplest way to get this is to recompute it from the constituent SCEVs, 3567 // that is Start + (Step * (CRD - 1)). 3568 for (User *U : OrigPhi->users()) { 3569 auto *UI = cast<Instruction>(U); 3570 if (!OrigLoop->contains(UI)) { 3571 const DataLayout &DL = 3572 OrigLoop->getHeader()->getModule()->getDataLayout(); 3573 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3574 3575 IRBuilder<> B(MiddleBlock->getTerminator()); 3576 Value *CountMinusOne = B.CreateSub( 3577 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3578 Value *CMO = 3579 !II.getStep()->getType()->isIntegerTy() 3580 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3581 II.getStep()->getType()) 3582 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3583 CMO->setName("cast.cmo"); 3584 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3585 Escape->setName("ind.escape"); 3586 MissingVals[UI] = Escape; 3587 } 3588 } 3589 3590 for (auto &I : MissingVals) { 3591 PHINode *PHI = cast<PHINode>(I.first); 3592 // One corner case we have to handle is two IVs "chasing" each-other, 3593 // that is %IV2 = phi [...], [ %IV1, %latch ] 3594 // In this case, if IV1 has an external use, we need to avoid adding both 3595 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3596 // don't already have an incoming value for the middle block. 3597 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3598 PHI->addIncoming(I.second, MiddleBlock); 3599 } 3600 } 3601 3602 namespace { 3603 3604 struct CSEDenseMapInfo { 3605 static bool canHandle(const Instruction *I) { 3606 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3607 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3608 } 3609 3610 static inline Instruction *getEmptyKey() { 3611 return DenseMapInfo<Instruction *>::getEmptyKey(); 3612 } 3613 3614 static inline Instruction *getTombstoneKey() { 3615 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3616 } 3617 3618 static unsigned getHashValue(const Instruction *I) { 3619 assert(canHandle(I) && "Unknown instruction!"); 3620 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3621 I->value_op_end())); 3622 } 3623 3624 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3625 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3626 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3627 return LHS == RHS; 3628 return LHS->isIdenticalTo(RHS); 3629 } 3630 }; 3631 3632 } // end anonymous namespace 3633 3634 ///Perform cse of induction variable instructions. 3635 static void cse(BasicBlock *BB) { 3636 // Perform simple cse. 3637 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3638 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3639 Instruction *In = &*I++; 3640 3641 if (!CSEDenseMapInfo::canHandle(In)) 3642 continue; 3643 3644 // Check if we can replace this instruction with any of the 3645 // visited instructions. 3646 if (Instruction *V = CSEMap.lookup(In)) { 3647 In->replaceAllUsesWith(V); 3648 In->eraseFromParent(); 3649 continue; 3650 } 3651 3652 CSEMap[In] = In; 3653 } 3654 } 3655 3656 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3657 ElementCount VF, 3658 bool &NeedToScalarize) { 3659 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3660 Function *F = CI->getCalledFunction(); 3661 Type *ScalarRetTy = CI->getType(); 3662 SmallVector<Type *, 4> Tys, ScalarTys; 3663 for (auto &ArgOp : CI->arg_operands()) 3664 ScalarTys.push_back(ArgOp->getType()); 3665 3666 // Estimate cost of scalarized vector call. The source operands are assumed 3667 // to be vectors, so we need to extract individual elements from there, 3668 // execute VF scalar calls, and then gather the result into the vector return 3669 // value. 3670 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3671 TTI::TCK_RecipThroughput); 3672 if (VF.isScalar()) 3673 return ScalarCallCost; 3674 3675 // Compute corresponding vector type for return value and arguments. 
3676 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3677 for (Type *ScalarTy : ScalarTys) 3678 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3679 3680 // Compute costs of unpacking argument values for the scalar calls and 3681 // packing the return values to a vector. 3682 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3683 3684 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3685 3686 // If we can't emit a vector call for this function, then the currently found 3687 // cost is the cost we need to return. 3688 NeedToScalarize = true; 3689 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3690 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3691 3692 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3693 return Cost; 3694 3695 // If the corresponding vector cost is cheaper, return its cost. 3696 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3697 TTI::TCK_RecipThroughput); 3698 if (VectorCallCost < Cost) { 3699 NeedToScalarize = false; 3700 return VectorCallCost; 3701 } 3702 return Cost; 3703 } 3704 3705 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3706 ElementCount VF) { 3707 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3708 assert(ID && "Expected intrinsic call!"); 3709 3710 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3711 return TTI.getIntrinsicInstrCost(CostAttrs, 3712 TargetTransformInfo::TCK_RecipThroughput); 3713 } 3714 3715 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3716 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3717 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3718 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3719 } 3720 3721 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3722 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3723 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3724 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3725 } 3726 3727 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3728 // For every instruction `I` in MinBWs, truncate the operands, create a 3729 // truncated version of `I` and reextend its result. InstCombine runs 3730 // later and will remove any ext/trunc pairs. 3731 SmallPtrSet<Value *, 4> Erased; 3732 for (const auto &KV : Cost->getMinimalBitwidths()) { 3733 // If the value wasn't vectorized, we must maintain the original scalar 3734 // type. The absence of the value from VectorLoopValueMap indicates that it 3735 // wasn't vectorized. 3736 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3737 continue; 3738 for (unsigned Part = 0; Part < UF; ++Part) { 3739 Value *I = getOrCreateVectorValue(KV.first, Part); 3740 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3741 continue; 3742 Type *OriginalTy = I->getType(); 3743 Type *ScalarTruncatedTy = 3744 IntegerType::get(OriginalTy->getContext(), KV.second); 3745 auto *TruncatedTy = FixedVectorType::get( 3746 ScalarTruncatedTy, 3747 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3748 if (TruncatedTy == OriginalTy) 3749 continue; 3750 3751 IRBuilder<> B(cast<Instruction>(I)); 3752 auto ShrinkOperand = [&](Value *V) -> Value * { 3753 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3754 if (ZI->getSrcTy() == TruncatedTy) 3755 return ZI->getOperand(0); 3756 return B.CreateZExtOrTrunc(V, TruncatedTy); 3757 }; 3758 3759 // The actual instruction modification depends on the instruction type, 3760 // unfortunately. 
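      // Illustrative sketch only (shorthand IR, VF = 4): an i32 add whose
      // result is known to need only 8 bits is rewritten as
      //   %a.tr = trunc <4 x i32> %a to <4 x i8>
      //   %b.tr = trunc <4 x i32> %b to <4 x i8>
      //   %add  = add <4 x i8> %a.tr, %b.tr
      //   %res  = zext <4 x i8> %add to <4 x i32>
      // and InstCombine is expected to remove any redundant ext/trunc pairs
      // afterwards.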
3761 Value *NewI = nullptr; 3762 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3763 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3764 ShrinkOperand(BO->getOperand(1))); 3765 3766 // Any wrapping introduced by shrinking this operation shouldn't be 3767 // considered undefined behavior. So, we can't unconditionally copy 3768 // arithmetic wrapping flags to NewI. 3769 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3770 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3771 NewI = 3772 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3773 ShrinkOperand(CI->getOperand(1))); 3774 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3775 NewI = B.CreateSelect(SI->getCondition(), 3776 ShrinkOperand(SI->getTrueValue()), 3777 ShrinkOperand(SI->getFalseValue())); 3778 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3779 switch (CI->getOpcode()) { 3780 default: 3781 llvm_unreachable("Unhandled cast!"); 3782 case Instruction::Trunc: 3783 NewI = ShrinkOperand(CI->getOperand(0)); 3784 break; 3785 case Instruction::SExt: 3786 NewI = B.CreateSExtOrTrunc( 3787 CI->getOperand(0), 3788 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3789 break; 3790 case Instruction::ZExt: 3791 NewI = B.CreateZExtOrTrunc( 3792 CI->getOperand(0), 3793 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3794 break; 3795 } 3796 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3797 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3798 ->getNumElements(); 3799 auto *O0 = B.CreateZExtOrTrunc( 3800 SI->getOperand(0), 3801 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3802 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3803 ->getNumElements(); 3804 auto *O1 = B.CreateZExtOrTrunc( 3805 SI->getOperand(1), 3806 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3807 3808 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3809 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3810 // Don't do anything with the operands, just extend the result. 3811 continue; 3812 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3813 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3814 ->getNumElements(); 3815 auto *O0 = B.CreateZExtOrTrunc( 3816 IE->getOperand(0), 3817 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3818 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3819 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3820 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3821 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3822 ->getNumElements(); 3823 auto *O0 = B.CreateZExtOrTrunc( 3824 EE->getOperand(0), 3825 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3826 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3827 } else { 3828 // If we don't know what to do, be conservative and don't do anything. 3829 continue; 3830 } 3831 3832 // Lastly, extend the result. 3833 NewI->takeName(cast<Instruction>(I)); 3834 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3835 I->replaceAllUsesWith(Res); 3836 cast<Instruction>(I)->eraseFromParent(); 3837 Erased.insert(I); 3838 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3839 } 3840 } 3841 3842 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3843 for (const auto &KV : Cost->getMinimalBitwidths()) { 3844 // If the value wasn't vectorized, we must maintain the original scalar 3845 // type. 
The absence of the value from VectorLoopValueMap indicates that it 3846 // wasn't vectorized. 3847 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3848 continue; 3849 for (unsigned Part = 0; Part < UF; ++Part) { 3850 Value *I = getOrCreateVectorValue(KV.first, Part); 3851 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3852 if (Inst && Inst->use_empty()) { 3853 Value *NewI = Inst->getOperand(0); 3854 Inst->eraseFromParent(); 3855 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3856 } 3857 } 3858 } 3859 } 3860 3861 void InnerLoopVectorizer::fixVectorizedLoop() { 3862 // Insert truncates and extends for any truncated instructions as hints to 3863 // InstCombine. 3864 if (VF.isVector()) 3865 truncateToMinimalBitwidths(); 3866 3867 // Fix widened non-induction PHIs by setting up the PHI operands. 3868 if (OrigPHIsToFix.size()) { 3869 assert(EnableVPlanNativePath && 3870 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3871 fixNonInductionPHIs(); 3872 } 3873 3874 // At this point every instruction in the original loop is widened to a 3875 // vector form. Now we need to fix the recurrences in the loop. These PHI 3876 // nodes are currently empty because we did not want to introduce cycles. 3877 // This is the second stage of vectorizing recurrences. 3878 fixCrossIterationPHIs(); 3879 3880 // Forget the original basic block. 3881 PSE.getSE()->forgetLoop(OrigLoop); 3882 3883 // Fix-up external users of the induction variables. 3884 for (auto &Entry : Legal->getInductionVars()) 3885 fixupIVUsers(Entry.first, Entry.second, 3886 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3887 IVEndValues[Entry.first], LoopMiddleBlock); 3888 3889 fixLCSSAPHIs(); 3890 for (Instruction *PI : PredicatedInstructions) 3891 sinkScalarOperands(&*PI); 3892 3893 // Remove redundant induction instructions. 3894 cse(LoopVectorBody); 3895 3896 // Set/update profile weights for the vector and remainder loops as original 3897 // loop iterations are now distributed among them. Note that original loop 3898 // represented by LoopScalarBody becomes remainder loop after vectorization. 3899 // 3900 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3901 // end up getting slightly roughened result but that should be OK since 3902 // profile is not inherently precise anyway. Note also possible bypass of 3903 // vector code caused by legality checks is ignored, assigning all the weight 3904 // to the vector loop, optimistically. 3905 assert(!VF.isScalable() && 3906 "cannot use scalable ElementCount to determine unroll factor"); 3907 setProfileInfoAfterUnrolling( 3908 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 3909 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 3910 } 3911 3912 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3913 // In order to support recurrences we need to be able to vectorize Phi nodes. 3914 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3915 // stage #2: We now need to fix the recurrences by adding incoming edges to 3916 // the currently empty PHI nodes. At this point every instruction in the 3917 // original loop is widened to a vector form so we can use them to construct 3918 // the incoming edges. 3919 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3920 // Handle first-order recurrences and reductions that need to be fixed. 
3921 if (Legal->isFirstOrderRecurrence(&Phi)) 3922 fixFirstOrderRecurrence(&Phi); 3923 else if (Legal->isReductionVariable(&Phi)) 3924 fixReduction(&Phi); 3925 } 3926 } 3927 3928 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3929 // This is the second phase of vectorizing first-order recurrences. An 3930 // overview of the transformation is described below. Suppose we have the 3931 // following loop. 3932 // 3933 // for (int i = 0; i < n; ++i) 3934 // b[i] = a[i] - a[i - 1]; 3935 // 3936 // There is a first-order recurrence on "a". For this loop, the shorthand 3937 // scalar IR looks like: 3938 // 3939 // scalar.ph: 3940 // s_init = a[-1] 3941 // br scalar.body 3942 // 3943 // scalar.body: 3944 // i = phi [0, scalar.ph], [i+1, scalar.body] 3945 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3946 // s2 = a[i] 3947 // b[i] = s2 - s1 3948 // br cond, scalar.body, ... 3949 // 3950 // In this example, s1 is a recurrence because it's value depends on the 3951 // previous iteration. In the first phase of vectorization, we created a 3952 // temporary value for s1. We now complete the vectorization and produce the 3953 // shorthand vector IR shown below (for VF = 4, UF = 1). 3954 // 3955 // vector.ph: 3956 // v_init = vector(..., ..., ..., a[-1]) 3957 // br vector.body 3958 // 3959 // vector.body 3960 // i = phi [0, vector.ph], [i+4, vector.body] 3961 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3962 // v2 = a[i, i+1, i+2, i+3]; 3963 // v3 = vector(v1(3), v2(0, 1, 2)) 3964 // b[i, i+1, i+2, i+3] = v2 - v3 3965 // br cond, vector.body, middle.block 3966 // 3967 // middle.block: 3968 // x = v2(3) 3969 // br scalar.ph 3970 // 3971 // scalar.ph: 3972 // s_init = phi [x, middle.block], [a[-1], otherwise] 3973 // br scalar.body 3974 // 3975 // After execution completes the vector loop, we extract the next value of 3976 // the recurrence (x) to use as the initial value in the scalar loop. 3977 3978 // Get the original loop preheader and single loop latch. 3979 auto *Preheader = OrigLoop->getLoopPreheader(); 3980 auto *Latch = OrigLoop->getLoopLatch(); 3981 3982 // Get the initial and previous values of the scalar recurrence. 3983 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3984 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3985 3986 // Create a vector from the initial value. 3987 auto *VectorInit = ScalarInit; 3988 if (VF.isVector()) { 3989 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3990 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 3991 VectorInit = Builder.CreateInsertElement( 3992 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3993 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 3994 } 3995 3996 // We constructed a temporary phi node in the first phase of vectorization. 3997 // This phi node will eventually be deleted. 3998 Builder.SetInsertPoint( 3999 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 4000 4001 // Create a phi node for the new recurrence. The current value will either be 4002 // the initial value inserted into a vector or loop-varying vector value. 4003 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4004 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4005 4006 // Get the vectorized previous value of the last part UF - 1. It appears last 4007 // among all unrolled iterations, due to the order of their construction. 
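  // For example (illustrative), with UF = 2 the unrolled parts 0 and 1 cover
  // the first and second group of VF scalar iterations of each wide iteration,
  // so part UF - 1 = 1 holds the most recent values of the recurrence source
  // and is where the insertion point is positioned below.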
4008 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 4009 4010 // Find and set the insertion point after the previous value if it is an 4011 // instruction. 4012 BasicBlock::iterator InsertPt; 4013 // Note that the previous value may have been constant-folded so it is not 4014 // guaranteed to be an instruction in the vector loop. 4015 // FIXME: Loop invariant values do not form recurrences. We should deal with 4016 // them earlier. 4017 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4018 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4019 else { 4020 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4021 if (isa<PHINode>(PreviousLastPart)) 4022 // If the previous value is a phi node, we should insert after all the phi 4023 // nodes in the block containing the PHI to avoid breaking basic block 4024 // verification. Note that the basic block may be different to 4025 // LoopVectorBody, in case we predicate the loop. 4026 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4027 else 4028 InsertPt = ++PreviousInst->getIterator(); 4029 } 4030 Builder.SetInsertPoint(&*InsertPt); 4031 4032 // We will construct a vector for the recurrence by combining the values for 4033 // the current and previous iterations. This is the required shuffle mask. 4034 assert(!VF.isScalable()); 4035 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4036 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4037 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4038 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4039 4040 // The vector from which to take the initial value for the current iteration 4041 // (actual or unrolled). Initially, this is the vector phi node. 4042 Value *Incoming = VecPhi; 4043 4044 // Shuffle the current and previous vector and update the vector parts. 4045 for (unsigned Part = 0; Part < UF; ++Part) { 4046 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 4047 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 4048 auto *Shuffle = 4049 VF.isVector() 4050 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4051 : Incoming; 4052 PhiPart->replaceAllUsesWith(Shuffle); 4053 cast<Instruction>(PhiPart)->eraseFromParent(); 4054 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 4055 Incoming = PreviousPart; 4056 } 4057 4058 // Fix the latch value of the new recurrence in the vector loop. 4059 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4060 4061 // Extract the last vector element in the middle block. This will be the 4062 // initial value for the recurrence when jumping to the scalar loop. 4063 auto *ExtractForScalar = Incoming; 4064 if (VF.isVector()) { 4065 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4066 ExtractForScalar = Builder.CreateExtractElement( 4067 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4068 "vector.recur.extract"); 4069 } 4070 // Extract the second last element in the middle block if the 4071 // Phi is used outside the loop. We need to extract the phi itself 4072 // and not the last element (the phi update in the current iteration). This 4073 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4074 // when the scalar loop is not run at all. 
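// Continuing the example above (VF = 4): if v2 = <a[i], a[i+1], a[i+2], a[i+3]>,
// then in the final scalar iteration (lane 3) the phi s1 holds a[i+2], i.e. the
// second-to-last lane of the last vector value.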
4075 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4076 if (VF.isVector()) 4077 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4078 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), 4079 "vector.recur.extract.for.phi"); 4080 // When loop is unrolled without vectorizing, initialize 4081 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 4082 // `Incoming`. This is analogous to the vectorized case above: extracting the 4083 // second last element when VF > 1. 4084 else if (UF > 1) 4085 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 4086 4087 // Fix the initial value of the original recurrence in the scalar loop. 4088 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4089 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4090 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4091 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4092 Start->addIncoming(Incoming, BB); 4093 } 4094 4095 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4096 Phi->setName("scalar.recur"); 4097 4098 // Finally, fix users of the recurrence outside the loop. The users will need 4099 // either the last value of the scalar recurrence or the last value of the 4100 // vector recurrence we extracted in the middle block. Since the loop is in 4101 // LCSSA form, we just need to find all the phi nodes for the original scalar 4102 // recurrence in the exit block, and then add an edge for the middle block. 4103 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4104 if (LCSSAPhi.getIncomingValue(0) == Phi) { 4105 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4106 } 4107 } 4108 } 4109 4110 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 4111 Constant *Zero = Builder.getInt32(0); 4112 4113 // Get its reduction variable descriptor. 4114 assert(Legal->isReductionVariable(Phi) && 4115 "Unable to find the reduction variable"); 4116 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4117 4118 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 4119 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4120 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4121 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 4122 RdxDesc.getMinMaxRecurrenceKind(); 4123 setDebugLocFromInst(Builder, ReductionStartValue); 4124 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 4125 4126 // We need to generate a reduction vector from the incoming scalar. 4127 // To do so, we need to generate the 'identity' vector and override 4128 // one of the elements with the incoming scalar reduction. We need 4129 // to do it in the vector-loop preheader. 4130 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4131 4132 // This is the vector-clone of the value that leaves the loop. 4133 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 4134 4135 // Find the reduction identity variable. Zero for addition, or and xor; 4136 // one for multiplication; -1 for and. 4137 Value *Identity; 4138 Value *VectorStart; 4139 if (RK == RecurrenceDescriptor::RK_IntegerMinMax || 4140 RK == RecurrenceDescriptor::RK_FloatMinMax) { 4141 // MinMax reductions have the start value as their identity.
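// E.g. for a max reduction started at %init, max(%init, %init) == %init, so
// splatting the start value into every lane is a safe identity (shorthand
// illustration only).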
4142 if (VF.isScalar() || IsInLoopReductionPhi) { 4143 VectorStart = Identity = ReductionStartValue; 4144 } else { 4145 VectorStart = Identity = 4146 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 4147 } 4148 } else { 4149 // Handle other reduction kinds: 4150 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 4151 RK, MinMaxKind, VecTy->getScalarType()); 4152 if (VF.isScalar() || IsInLoopReductionPhi) { 4153 Identity = Iden; 4154 // This vector is the Identity vector where the first element is the 4155 // incoming scalar reduction. 4156 VectorStart = ReductionStartValue; 4157 } else { 4158 Identity = ConstantVector::getSplat(VF, Iden); 4159 4160 // This vector is the Identity vector where the first element is the 4161 // incoming scalar reduction. 4162 VectorStart = 4163 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 4164 } 4165 } 4166 4167 // Wrap flags are in general invalid after vectorization, clear them. 4168 clearReductionWrapFlags(RdxDesc); 4169 4170 // Fix the vector-loop phi. 4171 4172 // Reductions do not have to start at zero. They can start with 4173 // any loop invariant values. 4174 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4175 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4176 4177 for (unsigned Part = 0; Part < UF; ++Part) { 4178 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 4179 Value *Val = getOrCreateVectorValue(LoopVal, Part); 4180 // Make sure to add the reduction start value only to the 4181 // first unroll part. 4182 Value *StartVal = (Part == 0) ? VectorStart : Identity; 4183 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 4184 cast<PHINode>(VecRdxPhi) 4185 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4186 } 4187 4188 // Before each round, move the insertion point right between 4189 // the PHIs and the values we are going to write. 4190 // This allows us to write both PHINodes and the extractelement 4191 // instructions. 4192 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4193 4194 setDebugLocFromInst(Builder, LoopExitInst); 4195 4196 // If tail is folded by masking, the vector value to leave the loop should be 4197 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4198 // instead of the former. For an inloop reduction the reduction will already 4199 // be predicated, and does not need to be handled here. 4200 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4201 for (unsigned Part = 0; Part < UF; ++Part) { 4202 Value *VecLoopExitInst = 4203 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4204 Value *Sel = nullptr; 4205 for (User *U : VecLoopExitInst->users()) { 4206 if (isa<SelectInst>(U)) { 4207 assert(!Sel && "Reduction exit feeding two selects"); 4208 Sel = U; 4209 } else 4210 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4211 } 4212 assert(Sel && "Reduction exit feeds no select"); 4213 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4214 4215 // If the target can create a predicated operator for the reduction at no 4216 // extra cost in the loop (for example a predicated vadd), it can be 4217 // cheaper for the select to remain in the loop than be sunk out of it, 4218 // and so use the select value for the phi instead of the old 4219 // LoopExitValue. 
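// Shorthand sketch of the pattern handled here (assumed IR, VF = 4):
//   %rdx.phi = phi <4 x i32> [ %start, %ph ], [ %add, %latch ]
//   %add     = add <4 x i32> %rdx.phi, %val
//   %sel     = select <4 x i1> %mask, <4 x i32> %add, <4 x i32> %rdx.phi
// By default the value leaving the loop becomes %sel; if the target prefers
// predicated reduction selects, the phi's latch incoming is also redirected
// to %sel.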
4220 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4221 if (PreferPredicatedReductionSelect || 4222 TTI->preferPredicatedReductionSelect( 4223 RdxDesc.getRecurrenceBinOp(), Phi->getType(), 4224 TargetTransformInfo::ReductionFlags())) { 4225 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4226 VecRdxPhi->setIncomingValueForBlock( 4227 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4228 } 4229 } 4230 } 4231 4232 // If the vector reduction can be performed in a smaller type, we truncate 4233 // then extend the loop exit value to enable InstCombine to evaluate the 4234 // entire expression in the smaller type. 4235 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4236 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4237 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4238 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4239 Builder.SetInsertPoint( 4240 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4241 VectorParts RdxParts(UF); 4242 for (unsigned Part = 0; Part < UF; ++Part) { 4243 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4244 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4245 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4246 : Builder.CreateZExt(Trunc, VecTy); 4247 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4248 UI != RdxParts[Part]->user_end();) 4249 if (*UI != Trunc) { 4250 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4251 RdxParts[Part] = Extnd; 4252 } else { 4253 ++UI; 4254 } 4255 } 4256 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4257 for (unsigned Part = 0; Part < UF; ++Part) { 4258 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4259 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4260 } 4261 } 4262 4263 // Reduce all of the unrolled parts into a single vector. 4264 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4265 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 4266 4267 // The middle block terminator has already been assigned a DebugLoc here (the 4268 // OrigLoop's single latch terminator). We want the whole middle block to 4269 // appear to execute on this line because: (a) it is all compiler generated, 4270 // (b) these instructions are always executed after evaluating the latch 4271 // conditional branch, and (c) other passes may add new predecessors which 4272 // terminate on this line. This is the easiest way to ensure we don't 4273 // accidentally cause an extra step back into the loop while debugging. 4274 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4275 for (unsigned Part = 1; Part < UF; ++Part) { 4276 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4277 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4278 // Floating point operations had to be 'fast' to enable the reduction. 4279 ReducedPartRdx = addFastMathFlag( 4280 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4281 ReducedPartRdx, "bin.rdx"), 4282 RdxDesc.getFastMathFlags()); 4283 else 4284 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 4285 RdxPart); 4286 } 4287 4288 // Create the reduction after the loop. Note that inloop reductions create the 4289 // target reduction in the loop using a Reduction recipe. 
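// E.g. for an add reduction with UF = 2, the two unrolled parts have already
// been combined into a single "bin.rdx" vector above; what remains is one
// horizontal reduction of that vector to a scalar, e.g. an
// @llvm.vector.reduce.add call or an equivalent shuffle sequence (the exact
// form is left to createTargetReduction; shorthand illustration only).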
4290 if (VF.isVector() && !IsInLoopReductionPhi) { 4291 bool NoNaN = Legal->hasFunNoNaNAttr(); 4292 ReducedPartRdx = 4293 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 4294 // If the reduction can be performed in a smaller type, we need to extend 4295 // the reduction to the wider type before we branch to the original loop. 4296 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4297 ReducedPartRdx = 4298 RdxDesc.isSigned() 4299 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4300 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4301 } 4302 4303 // Create a phi node that merges control-flow from the backedge-taken check 4304 // block and the middle block. 4305 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4306 LoopScalarPreHeader->getTerminator()); 4307 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4308 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4309 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4310 4311 // Now, we need to fix the users of the reduction variable 4312 // inside and outside of the scalar remainder loop. 4313 // We know that the loop is in LCSSA form. We need to update the 4314 // PHI nodes in the exit blocks. 4315 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4316 // All PHINodes need to have a single entry edge, or two if 4317 // we already fixed them. 4318 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4319 4320 // We found a reduction value exit-PHI. Update it with the 4321 // incoming bypass edge. 4322 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4323 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4324 } // end of the LCSSA phi scan. 4325 4326 // Fix the scalar loop reduction variable with the incoming reduction sum 4327 // from the vector body and from the backedge value. 4328 int IncomingEdgeBlockIdx = 4329 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4330 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4331 // Pick the other block. 4332 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4333 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4334 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4335 } 4336 4337 void InnerLoopVectorizer::clearReductionWrapFlags( 4338 RecurrenceDescriptor &RdxDesc) { 4339 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 4340 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 4341 RK != RecurrenceDescriptor::RK_IntegerMult) 4342 return; 4343 4344 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4345 assert(LoopExitInstr && "null loop exit instruction"); 4346 SmallVector<Instruction *, 8> Worklist; 4347 SmallPtrSet<Instruction *, 8> Visited; 4348 Worklist.push_back(LoopExitInstr); 4349 Visited.insert(LoopExitInstr); 4350 4351 while (!Worklist.empty()) { 4352 Instruction *Cur = Worklist.pop_back_val(); 4353 if (isa<OverflowingBinaryOperator>(Cur)) 4354 for (unsigned Part = 0; Part < UF; ++Part) { 4355 Value *V = getOrCreateVectorValue(Cur, Part); 4356 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4357 } 4358 4359 for (User *U : Cur->users()) { 4360 Instruction *UI = cast<Instruction>(U); 4361 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4362 Visited.insert(UI).second) 4363 Worklist.push_back(UI); 4364 } 4365 } 4366 } 4367 4368 void InnerLoopVectorizer::fixLCSSAPHIs() { 4369 assert(!VF.isScalable() && "the code below assumes fixed width vectors"); 4370 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4371 if (LCSSAPhi.getNumIncomingValues() == 1) { 4372 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4373 // Non-instruction incoming values will have only one value. 4374 unsigned LastLane = 0; 4375 if (isa<Instruction>(IncomingValue)) 4376 LastLane = Cost->isUniformAfterVectorization( 4377 cast<Instruction>(IncomingValue), VF) 4378 ? 0 4379 : VF.getKnownMinValue() - 1; 4380 // Can be a loop invariant incoming value or the last scalar value to be 4381 // extracted from the vectorized loop. 4382 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4383 Value *lastIncomingValue = 4384 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4385 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4386 } 4387 } 4388 } 4389 4390 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4391 // The basic block and loop containing the predicated instruction. 4392 auto *PredBB = PredInst->getParent(); 4393 auto *VectorLoop = LI->getLoopFor(PredBB); 4394 4395 // Initialize a worklist with the operands of the predicated instruction. 4396 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4397 4398 // Holds instructions that we need to analyze again. An instruction may be 4399 // reanalyzed if we don't yet know if we can sink it or not. 4400 SmallVector<Instruction *, 8> InstsToReanalyze; 4401 4402 // Returns true if a given use occurs in the predicated block. Phi nodes use 4403 // their operands in their corresponding predecessor blocks. 4404 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4405 auto *I = cast<Instruction>(U.getUser()); 4406 BasicBlock *BB = I->getParent(); 4407 if (auto *Phi = dyn_cast<PHINode>(I)) 4408 BB = Phi->getIncomingBlock( 4409 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4410 return BB == PredBB; 4411 }; 4412 4413 // Iteratively sink the scalarized operands of the predicated instruction 4414 // into the block we created for it. When an instruction is sunk, it's 4415 // operands are then added to the worklist. 
The algorithm ends after one pass 4416 // through the worklist doesn't sink a single instruction. 4417 bool Changed; 4418 do { 4419 // Add the instructions that need to be reanalyzed to the worklist, and 4420 // reset the changed indicator. 4421 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4422 InstsToReanalyze.clear(); 4423 Changed = false; 4424 4425 while (!Worklist.empty()) { 4426 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4427 4428 // We can't sink an instruction if it is a phi node, is already in the 4429 // predicated block, is not in the loop, or may have side effects. 4430 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4431 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4432 continue; 4433 4434 // It's legal to sink the instruction if all its uses occur in the 4435 // predicated block. Otherwise, there's nothing to do yet, and we may 4436 // need to reanalyze the instruction. 4437 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4438 InstsToReanalyze.push_back(I); 4439 continue; 4440 } 4441 4442 // Move the instruction to the beginning of the predicated block, and add 4443 // it's operands to the worklist. 4444 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4445 Worklist.insert(I->op_begin(), I->op_end()); 4446 4447 // The sinking may have enabled other instructions to be sunk, so we will 4448 // need to iterate. 4449 Changed = true; 4450 } 4451 } while (Changed); 4452 } 4453 4454 void InnerLoopVectorizer::fixNonInductionPHIs() { 4455 for (PHINode *OrigPhi : OrigPHIsToFix) { 4456 PHINode *NewPhi = 4457 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4458 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4459 4460 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4461 predecessors(OrigPhi->getParent())); 4462 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4463 predecessors(NewPhi->getParent())); 4464 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4465 "Scalar and Vector BB should have the same number of predecessors"); 4466 4467 // The insertion point in Builder may be invalidated by the time we get 4468 // here. Force the Builder insertion point to something valid so that we do 4469 // not run into issues during insertion point restore in 4470 // getOrCreateVectorValue calls below. 4471 Builder.SetInsertPoint(NewPhi); 4472 4473 // The predecessor order is preserved and we can rely on mapping between 4474 // scalar and vector block predecessors. 4475 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4476 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4477 4478 // When looking up the new scalar/vector values to fix up, use incoming 4479 // values from original phi. 4480 Value *ScIncV = 4481 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4482 4483 // Scalar incoming value may need a broadcast 4484 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4485 NewPhi->addIncoming(NewIncV, NewPredBB); 4486 } 4487 } 4488 } 4489 4490 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4491 VPUser &Operands, unsigned UF, 4492 ElementCount VF, bool IsPtrLoopInvariant, 4493 SmallBitVector &IsIndexLoopInvariant, 4494 VPTransformState &State) { 4495 // Construct a vector GEP by widening the operands of the scalar GEP as 4496 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4497 // results in a vector of pointers when at least one operand of the GEP 4498 // is vector-typed. 
Thus, to keep the representation compact, we only use 4499 // vector-typed operands for loop-varying values. 4500 4501 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4502 // If we are vectorizing, but the GEP has only loop-invariant operands, 4503 // the GEP we build (by only using vector-typed operands for 4504 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4505 // produce a vector of pointers, we need to either arbitrarily pick an 4506 // operand to broadcast, or broadcast a clone of the original GEP. 4507 // Here, we broadcast a clone of the original. 4508 // 4509 // TODO: If at some point we decide to scalarize instructions having 4510 // loop-invariant operands, this special case will no longer be 4511 // required. We would add the scalarization decision to 4512 // collectLoopScalars() and teach getVectorValue() to broadcast 4513 // the lane-zero scalar value. 4514 auto *Clone = Builder.Insert(GEP->clone()); 4515 for (unsigned Part = 0; Part < UF; ++Part) { 4516 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4517 State.set(VPDef, GEP, EntryPart, Part); 4518 addMetadata(EntryPart, GEP); 4519 } 4520 } else { 4521 // If the GEP has at least one loop-varying operand, we are sure to 4522 // produce a vector of pointers. But if we are only unrolling, we want 4523 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4524 // produce with the code below will be scalar (if VF == 1) or vector 4525 // (otherwise). Note that for the unroll-only case, we still maintain 4526 // values in the vector mapping with initVector, as we do for other 4527 // instructions. 4528 for (unsigned Part = 0; Part < UF; ++Part) { 4529 // The pointer operand of the new GEP. If it's loop-invariant, we 4530 // won't broadcast it. 4531 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4532 : State.get(Operands.getOperand(0), Part); 4533 4534 // Collect all the indices for the new GEP. If any index is 4535 // loop-invariant, we won't broadcast it. 4536 SmallVector<Value *, 4> Indices; 4537 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4538 VPValue *Operand = Operands.getOperand(I); 4539 if (IsIndexLoopInvariant[I - 1]) 4540 Indices.push_back(State.get(Operand, {0, 0})); 4541 else 4542 Indices.push_back(State.get(Operand, Part)); 4543 } 4544 4545 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4546 // but it should be a vector, otherwise. 4547 auto *NewGEP = 4548 GEP->isInBounds() 4549 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4550 Indices) 4551 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4552 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4553 "NewGEP is not a pointer vector"); 4554 State.set(VPDef, GEP, NewGEP, Part); 4555 addMetadata(NewGEP, GEP); 4556 } 4557 } 4558 } 4559 4560 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4561 ElementCount VF) { 4562 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4563 PHINode *P = cast<PHINode>(PN); 4564 if (EnableVPlanNativePath) { 4565 // Currently we enter here in the VPlan-native path for non-induction 4566 // PHIs where all control flow is uniform. We simply widen these PHIs. 4567 // Create a vector phi with no operands - the vector phi operands will be 4568 // set at the end of vector code generation. 4569 Type *VecTy = 4570 (VF.isScalar()) ? 
PN->getType() : VectorType::get(PN->getType(), VF); 4571 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4572 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4573 OrigPHIsToFix.push_back(P); 4574 4575 return; 4576 } 4577 4578 assert(PN->getParent() == OrigLoop->getHeader() && 4579 "Non-header phis should have been handled elsewhere"); 4580 4581 // In order to support recurrences we need to be able to vectorize Phi nodes. 4582 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4583 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4584 // this value when we vectorize all of the instructions that use the PHI. 4585 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4586 for (unsigned Part = 0; Part < UF; ++Part) { 4587 // This is phase one of vectorizing PHIs. 4588 bool ScalarPHI = 4589 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4590 Type *VecTy = 4591 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4592 Value *EntryPart = PHINode::Create( 4593 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4594 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4595 } 4596 return; 4597 } 4598 4599 setDebugLocFromInst(Builder, P); 4600 4601 // This PHINode must be an induction variable. 4602 // Make sure that we know about it. 4603 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4604 4605 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4606 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4607 4608 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4609 // which can be found from the original scalar operations. 4610 switch (II.getKind()) { 4611 case InductionDescriptor::IK_NoInduction: 4612 llvm_unreachable("Unknown induction"); 4613 case InductionDescriptor::IK_IntInduction: 4614 case InductionDescriptor::IK_FpInduction: 4615 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4616 case InductionDescriptor::IK_PtrInduction: { 4617 // Handle the pointer induction variable case. 4618 assert(P->getType()->isPointerTy() && "Unexpected type."); 4619 4620 if (Cost->isScalarAfterVectorization(P, VF)) { 4621 // This is the normalized GEP that starts counting at zero. 4622 Value *PtrInd = 4623 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4624 // Determine the number of scalars we need to generate for each unroll 4625 // iteration. If the instruction is uniform, we only need to generate the 4626 // first lane. Otherwise, we generate all VF values. 4627 unsigned Lanes = 4628 Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.getKnownMinValue(); 4629 for (unsigned Part = 0; Part < UF; ++Part) { 4630 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4631 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4632 Lane + Part * VF.getKnownMinValue()); 4633 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4634 Value *SclrGep = 4635 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4636 SclrGep->setName("next.gep"); 4637 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4638 } 4639 } 4640 return; 4641 } 4642 assert(isa<SCEVConstant>(II.getStep()) && 4643 "Induction step not a SCEV constant!"); 4644 Type *PhiType = II.getStep()->getType(); 4645 4646 // Build a pointer phi 4647 Value *ScalarStartValue = II.getStartValue(); 4648 Type *ScStValueType = ScalarStartValue->getType(); 4649 PHINode *NewPointerPhi = 4650 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4651 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4652 4653 // A pointer induction, performed by using a gep 4654 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4655 Instruction *InductionLoc = LoopLatch->getTerminator(); 4656 const SCEV *ScalarStep = II.getStep(); 4657 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4658 Value *ScalarStepValue = 4659 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4660 Value *InductionGEP = GetElementPtrInst::Create( 4661 ScStValueType->getPointerElementType(), NewPointerPhi, 4662 Builder.CreateMul( 4663 ScalarStepValue, 4664 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4665 "ptr.ind", InductionLoc); 4666 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4667 4668 // Create UF many actual address geps that use the pointer 4669 // phi as base and a vectorized version of the step value 4670 // (<step*0, ..., step*N>) as offset. 4671 for (unsigned Part = 0; Part < UF; ++Part) { 4672 SmallVector<Constant *, 8> Indices; 4673 // Create a vector of consecutive numbers from zero to VF. 4674 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4675 Indices.push_back( 4676 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4677 Constant *StartOffset = ConstantVector::get(Indices); 4678 4679 Value *GEP = Builder.CreateGEP( 4680 ScStValueType->getPointerElementType(), NewPointerPhi, 4681 Builder.CreateMul( 4682 StartOffset, 4683 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4684 "vector.gep")); 4685 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4686 } 4687 } 4688 } 4689 } 4690 4691 /// A helper function for checking whether an integer division-related 4692 /// instruction may divide by zero (in which case it must be predicated if 4693 /// executed conditionally in the scalar code). 4694 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4695 /// Non-zero divisors that are non compile-time constants will not be 4696 /// converted into multiplication, so we will still end up scalarizing 4697 /// the division, but can do so w/o predication. 
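/// For example, `x / 8` can never trap and may stay unpredicated, whereas
/// `x / y` with a non-constant `y` must remain predicated if the original
/// scalar code only executed it under a guard.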
4698 static bool mayDivideByZero(Instruction &I) { 4699 assert((I.getOpcode() == Instruction::UDiv || 4700 I.getOpcode() == Instruction::SDiv || 4701 I.getOpcode() == Instruction::URem || 4702 I.getOpcode() == Instruction::SRem) && 4703 "Unexpected instruction"); 4704 Value *Divisor = I.getOperand(1); 4705 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4706 return !CInt || CInt->isZero(); 4707 } 4708 4709 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4710 VPUser &User, 4711 VPTransformState &State) { 4712 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4713 switch (I.getOpcode()) { 4714 case Instruction::Call: 4715 case Instruction::Br: 4716 case Instruction::PHI: 4717 case Instruction::GetElementPtr: 4718 case Instruction::Select: 4719 llvm_unreachable("This instruction is handled by a different recipe."); 4720 case Instruction::UDiv: 4721 case Instruction::SDiv: 4722 case Instruction::SRem: 4723 case Instruction::URem: 4724 case Instruction::Add: 4725 case Instruction::FAdd: 4726 case Instruction::Sub: 4727 case Instruction::FSub: 4728 case Instruction::FNeg: 4729 case Instruction::Mul: 4730 case Instruction::FMul: 4731 case Instruction::FDiv: 4732 case Instruction::FRem: 4733 case Instruction::Shl: 4734 case Instruction::LShr: 4735 case Instruction::AShr: 4736 case Instruction::And: 4737 case Instruction::Or: 4738 case Instruction::Xor: { 4739 // Just widen unops and binops. 4740 setDebugLocFromInst(Builder, &I); 4741 4742 for (unsigned Part = 0; Part < UF; ++Part) { 4743 SmallVector<Value *, 2> Ops; 4744 for (VPValue *VPOp : User.operands()) 4745 Ops.push_back(State.get(VPOp, Part)); 4746 4747 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4748 4749 if (auto *VecOp = dyn_cast<Instruction>(V)) 4750 VecOp->copyIRFlags(&I); 4751 4752 // Use this vector value for all users of the original instruction. 4753 State.set(Def, &I, V, Part); 4754 addMetadata(V, &I); 4755 } 4756 4757 break; 4758 } 4759 case Instruction::ICmp: 4760 case Instruction::FCmp: { 4761 // Widen compares. Generate vector compares. 4762 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4763 auto *Cmp = cast<CmpInst>(&I); 4764 setDebugLocFromInst(Builder, Cmp); 4765 for (unsigned Part = 0; Part < UF; ++Part) { 4766 Value *A = State.get(User.getOperand(0), Part); 4767 Value *B = State.get(User.getOperand(1), Part); 4768 Value *C = nullptr; 4769 if (FCmp) { 4770 // Propagate fast math flags. 4771 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4772 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4773 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4774 } else { 4775 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4776 } 4777 State.set(Def, &I, C, Part); 4778 addMetadata(C, &I); 4779 } 4780 4781 break; 4782 } 4783 4784 case Instruction::ZExt: 4785 case Instruction::SExt: 4786 case Instruction::FPToUI: 4787 case Instruction::FPToSI: 4788 case Instruction::FPExt: 4789 case Instruction::PtrToInt: 4790 case Instruction::IntToPtr: 4791 case Instruction::SIToFP: 4792 case Instruction::UIToFP: 4793 case Instruction::Trunc: 4794 case Instruction::FPTrunc: 4795 case Instruction::BitCast: { 4796 auto *CI = cast<CastInst>(&I); 4797 setDebugLocFromInst(Builder, CI); 4798 4799 /// Vectorize casts. 4800 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4801 Type *DestTy = 4802 (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4803 4804 for (unsigned Part = 0; Part < UF; ++Part) { 4805 Value *A = State.get(User.getOperand(0), Part); 4806 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4807 State.set(Def, &I, Cast, Part); 4808 addMetadata(Cast, &I); 4809 } 4810 break; 4811 } 4812 default: 4813 // This instruction is not vectorized by simple widening. 4814 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4815 llvm_unreachable("Unhandled instruction!"); 4816 } // end of switch. 4817 } 4818 4819 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4820 VPUser &ArgOperands, 4821 VPTransformState &State) { 4822 assert(!isa<DbgInfoIntrinsic>(I) && 4823 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4824 setDebugLocFromInst(Builder, &I); 4825 4826 Module *M = I.getParent()->getParent()->getParent(); 4827 auto *CI = cast<CallInst>(&I); 4828 4829 SmallVector<Type *, 4> Tys; 4830 for (Value *ArgOperand : CI->arg_operands()) 4831 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4832 4833 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4834 4835 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4836 // version of the instruction. 4837 // Is it beneficial to perform intrinsic call compared to lib call? 4838 bool NeedToScalarize = false; 4839 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4840 bool UseVectorIntrinsic = 4841 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4842 assert((UseVectorIntrinsic || !NeedToScalarize) && 4843 "Instruction should be scalarized elsewhere."); 4844 4845 for (unsigned Part = 0; Part < UF; ++Part) { 4846 SmallVector<Value *, 4> Args; 4847 for (auto &I : enumerate(ArgOperands.operands())) { 4848 // Some intrinsics have a scalar argument - don't replace it with a 4849 // vector. 4850 Value *Arg; 4851 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4852 Arg = State.get(I.value(), Part); 4853 else 4854 Arg = State.get(I.value(), {0, 0}); 4855 Args.push_back(Arg); 4856 } 4857 4858 Function *VectorF; 4859 if (UseVectorIntrinsic) { 4860 // Use vector version of the intrinsic. 4861 Type *TysForDecl[] = {CI->getType()}; 4862 if (VF.isVector()) { 4863 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4864 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4865 } 4866 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4867 assert(VectorF && "Can't retrieve vector intrinsic."); 4868 } else { 4869 // Use vector version of the function call. 4870 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4871 #ifndef NDEBUG 4872 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4873 "Can't create vector function."); 4874 #endif 4875 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4876 } 4877 SmallVector<OperandBundleDef, 1> OpBundles; 4878 CI->getOperandBundlesAsDefs(OpBundles); 4879 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4880 4881 if (isa<FPMathOperator>(V)) 4882 V->copyFastMathFlags(CI); 4883 4884 State.set(Def, &I, V, Part); 4885 addMetadata(V, &I); 4886 } 4887 } 4888 4889 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4890 VPUser &Operands, 4891 bool InvariantCond, 4892 VPTransformState &State) { 4893 setDebugLocFromInst(Builder, &I); 4894 4895 // The condition can be loop invariant but still defined inside the 4896 // loop. 
This means that we can't just use the original 'cond' value. 4897 // We have to take the 'vectorized' value and pick the first lane. 4898 // Instcombine will make this a no-op. 4899 auto *InvarCond = 4900 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4901 4902 for (unsigned Part = 0; Part < UF; ++Part) { 4903 Value *Cond = 4904 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4905 Value *Op0 = State.get(Operands.getOperand(1), Part); 4906 Value *Op1 = State.get(Operands.getOperand(2), Part); 4907 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4908 State.set(VPDef, &I, Sel, Part); 4909 addMetadata(Sel, &I); 4910 } 4911 } 4912 4913 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4914 // We should not collect Scalars more than once per VF. Right now, this 4915 // function is called from collectUniformsAndScalars(), which already does 4916 // this check. Collecting Scalars for VF=1 does not make any sense. 4917 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4918 "This function should not be visited twice for the same VF"); 4919 4920 SmallSetVector<Instruction *, 8> Worklist; 4921 4922 // These sets are used to seed the analysis with pointers used by memory 4923 // accesses that will remain scalar. 4924 SmallSetVector<Instruction *, 8> ScalarPtrs; 4925 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4926 auto *Latch = TheLoop->getLoopLatch(); 4927 4928 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4929 // The pointer operands of loads and stores will be scalar as long as the 4930 // memory access is not a gather or scatter operation. The value operand of a 4931 // store will remain scalar if the store is scalarized. 4932 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4933 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4934 assert(WideningDecision != CM_Unknown && 4935 "Widening decision should be ready at this moment"); 4936 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4937 if (Ptr == Store->getValueOperand()) 4938 return WideningDecision == CM_Scalarize; 4939 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4940 "Ptr is neither a value or pointer operand"); 4941 return WideningDecision != CM_GatherScatter; 4942 }; 4943 4944 // A helper that returns true if the given value is a bitcast or 4945 // getelementptr instruction contained in the loop. 4946 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4947 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4948 isa<GetElementPtrInst>(V)) && 4949 !TheLoop->isLoopInvariant(V); 4950 }; 4951 4952 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4953 if (!isa<PHINode>(Ptr) || 4954 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4955 return false; 4956 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4957 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4958 return false; 4959 return isScalarUse(MemAccess, Ptr); 4960 }; 4961 4962 // A helper that evaluates a memory access's use of a pointer. If the 4963 // pointer is actually the pointer induction of a loop, it is being 4964 // inserted into Worklist. If the use will be a scalar use, and the 4965 // pointer is only used by memory accesses, we place the pointer in 4966 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
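// For illustration (assumed shorthand IR):
//   %gep = getelementptr i32, i32* %base, i64 %i
//   %v   = load i32, i32* %gep
// If the load uses %gep as a scalar address (i.e. it is not a gather) and all
// of %gep's users are memory accesses, %gep is placed in ScalarPtrs; if %gep
// is also used by, say, a ptrtoint, it is placed in PossibleNonScalarPtrs.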
4967 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4968 if (isScalarPtrInduction(MemAccess, Ptr)) { 4969 Worklist.insert(cast<Instruction>(Ptr)); 4970 Instruction *Update = cast<Instruction>( 4971 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 4972 Worklist.insert(Update); 4973 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 4974 << "\n"); 4975 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 4976 << "\n"); 4977 return; 4978 } 4979 // We only care about bitcast and getelementptr instructions contained in 4980 // the loop. 4981 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4982 return; 4983 4984 // If the pointer has already been identified as scalar (e.g., if it was 4985 // also identified as uniform), there's nothing to do. 4986 auto *I = cast<Instruction>(Ptr); 4987 if (Worklist.count(I)) 4988 return; 4989 4990 // If the use of the pointer will be a scalar use, and all users of the 4991 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4992 // place the pointer in PossibleNonScalarPtrs. 4993 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4994 return isa<LoadInst>(U) || isa<StoreInst>(U); 4995 })) 4996 ScalarPtrs.insert(I); 4997 else 4998 PossibleNonScalarPtrs.insert(I); 4999 }; 5000 5001 // We seed the scalars analysis with two classes of instructions: (1) 5002 // instructions marked uniform-after-vectorization and (2) bitcast, 5003 // getelementptr and (pointer) phi instructions used by memory accesses 5004 // requiring a scalar use. 5005 // 5006 // (1) Add to the worklist all instructions that have been identified as 5007 // uniform-after-vectorization. 5008 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5009 5010 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5011 // memory accesses requiring a scalar use. The pointer operands of loads and 5012 // stores will be scalar as long as the memory access is not a gather or 5013 // scatter operation. The value operand of a store will remain scalar if the 5014 // store is scalarized. 5015 for (auto *BB : TheLoop->blocks()) 5016 for (auto &I : *BB) { 5017 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5018 evaluatePtrUse(Load, Load->getPointerOperand()); 5019 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5020 evaluatePtrUse(Store, Store->getPointerOperand()); 5021 evaluatePtrUse(Store, Store->getValueOperand()); 5022 } 5023 } 5024 for (auto *I : ScalarPtrs) 5025 if (!PossibleNonScalarPtrs.count(I)) { 5026 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5027 Worklist.insert(I); 5028 } 5029 5030 // Insert the forced scalars. 5031 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5032 // induction variable when the PHI user is scalarized. 5033 auto ForcedScalar = ForcedScalars.find(VF); 5034 if (ForcedScalar != ForcedScalars.end()) 5035 for (auto *I : ForcedScalar->second) 5036 Worklist.insert(I); 5037 5038 // Expand the worklist by looking through any bitcasts and getelementptr 5039 // instructions we've already identified as scalar. This is similar to the 5040 // expansion step in collectLoopUniforms(); however, here we're only 5041 // expanding to include additional bitcasts and getelementptr instructions.
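// E.g. if a getelementptr already known to be scalar takes its pointer operand
// from a loop-varying bitcast, and every in-loop user of that bitcast is either
// already in the worklist or a memory access using it as a scalar address, the
// bitcast is marked scalar as well.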
5042 unsigned Idx = 0; 5043 while (Idx != Worklist.size()) { 5044 Instruction *Dst = Worklist[Idx++]; 5045 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5046 continue; 5047 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5048 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5049 auto *J = cast<Instruction>(U); 5050 return !TheLoop->contains(J) || Worklist.count(J) || 5051 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5052 isScalarUse(J, Src)); 5053 })) { 5054 Worklist.insert(Src); 5055 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5056 } 5057 } 5058 5059 // An induction variable will remain scalar if all users of the induction 5060 // variable and induction variable update remain scalar. 5061 for (auto &Induction : Legal->getInductionVars()) { 5062 auto *Ind = Induction.first; 5063 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5064 5065 // If tail-folding is applied, the primary induction variable will be used 5066 // to feed a vector compare. 5067 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5068 continue; 5069 5070 // Determine if all users of the induction variable are scalar after 5071 // vectorization. 5072 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5073 auto *I = cast<Instruction>(U); 5074 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5075 }); 5076 if (!ScalarInd) 5077 continue; 5078 5079 // Determine if all users of the induction variable update instruction are 5080 // scalar after vectorization. 5081 auto ScalarIndUpdate = 5082 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5083 auto *I = cast<Instruction>(U); 5084 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5085 }); 5086 if (!ScalarIndUpdate) 5087 continue; 5088 5089 // The induction variable and its update instruction will remain scalar. 5090 Worklist.insert(Ind); 5091 Worklist.insert(IndUpdate); 5092 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5093 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5094 << "\n"); 5095 } 5096 5097 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5098 } 5099 5100 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5101 ElementCount VF) { 5102 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5103 if (!blockNeedsPredication(I->getParent())) 5104 return false; 5105 switch(I->getOpcode()) { 5106 default: 5107 break; 5108 case Instruction::Load: 5109 case Instruction::Store: { 5110 if (!Legal->isMaskRequired(I)) 5111 return false; 5112 auto *Ptr = getLoadStorePointerOperand(I); 5113 auto *Ty = getMemInstValueType(I); 5114 // We have already decided how to vectorize this instruction, get that 5115 // result. 5116 if (VF.isVector()) { 5117 InstWidening WideningDecision = getWideningDecision(I, VF); 5118 assert(WideningDecision != CM_Unknown && 5119 "Widening decision should be ready at this moment"); 5120 return WideningDecision == CM_Scalarize; 5121 } 5122 const Align Alignment = getLoadStoreAlignment(I); 5123 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5124 isLegalMaskedGather(Ty, Alignment)) 5125 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5126 isLegalMaskedScatter(Ty, Alignment)); 5127 } 5128 case Instruction::UDiv: 5129 case Instruction::SDiv: 5130 case Instruction::SRem: 5131 case Instruction::URem: 5132 return mayDivideByZero(*I); 5133 } 5134 return false; 5135 } 5136 5137 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5138 Instruction *I, ElementCount VF) { 5139 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5140 assert(getWideningDecision(I, VF) == CM_Unknown && 5141 "Decision should not be set yet."); 5142 auto *Group = getInterleavedAccessGroup(I); 5143 assert(Group && "Must have a group."); 5144 5145 // If the instruction's allocated size doesn't equal it's type size, it 5146 // requires padding and will be scalarized. 5147 auto &DL = I->getModule()->getDataLayout(); 5148 auto *ScalarTy = getMemInstValueType(I); 5149 if (hasIrregularType(ScalarTy, DL, VF)) 5150 return false; 5151 5152 // Check if masking is required. 5153 // A Group may need masking for one of two reasons: it resides in a block that 5154 // needs predication, or it was decided to use masking to deal with gaps. 5155 bool PredicatedAccessRequiresMasking = 5156 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5157 bool AccessWithGapsRequiresMasking = 5158 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5159 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5160 return true; 5161 5162 // If masked interleaving is required, we expect that the user/target had 5163 // enabled it, because otherwise it either wouldn't have been created or 5164 // it should have been invalidated by the CostModel. 5165 assert(useMaskedInterleavedAccesses(TTI) && 5166 "Masked interleave-groups for predicated accesses are not enabled."); 5167 5168 auto *Ty = getMemInstValueType(I); 5169 const Align Alignment = getLoadStoreAlignment(I); 5170 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5171 : TTI.isLegalMaskedStore(Ty, Alignment); 5172 } 5173 5174 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5175 Instruction *I, ElementCount VF) { 5176 // Get and ensure we have a valid memory instruction. 5177 LoadInst *LI = dyn_cast<LoadInst>(I); 5178 StoreInst *SI = dyn_cast<StoreInst>(I); 5179 assert((LI || SI) && "Invalid memory instruction"); 5180 5181 auto *Ptr = getLoadStorePointerOperand(I); 5182 5183 // In order to be widened, the pointer should be consecutive, first of all. 5184 if (!Legal->isConsecutivePtr(Ptr)) 5185 return false; 5186 5187 // If the instruction is a store located in a predicated block, it will be 5188 // scalarized. 5189 if (isScalarWithPredication(I)) 5190 return false; 5191 5192 // If the instruction's allocated size doesn't equal it's type size, it 5193 // requires padding and will be scalarized. 5194 auto &DL = I->getModule()->getDataLayout(); 5195 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5196 if (hasIrregularType(ScalarTy, DL, VF)) 5197 return false; 5198 5199 return true; 5200 } 5201 5202 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5203 // We should not collect Uniforms more than once per VF. Right now, 5204 // this function is called from collectUniformsAndScalars(), which 5205 // already does this check. Collecting Uniforms for VF=1 does not make any 5206 // sense. 
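// (Here, "uniform" means that only the first lane of an instruction is needed
// after vectorization, e.g. the scalar address feeding a consecutive wide load
// or a compare that is only used by the loop's backedge branch.)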
5207 5208 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5209 "This function should not be visited twice for the same VF"); 5210 5211 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5212 // not analyze again. Uniforms.count(VF) will return 1. 5213 Uniforms[VF].clear(); 5214 5215 // We now know that the loop is vectorizable! 5216 // Collect instructions inside the loop that will remain uniform after 5217 // vectorization. 5218 5219 // Global values, params and instructions outside of current loop are out of 5220 // scope. 5221 auto isOutOfScope = [&](Value *V) -> bool { 5222 Instruction *I = dyn_cast<Instruction>(V); 5223 return (!I || !TheLoop->contains(I)); 5224 }; 5225 5226 SetVector<Instruction *> Worklist; 5227 BasicBlock *Latch = TheLoop->getLoopLatch(); 5228 5229 // Instructions that are scalar with predication must not be considered 5230 // uniform after vectorization, because that would create an erroneous 5231 // replicating region where only a single instance out of VF should be formed. 5232 // TODO: optimize such seldom cases if found important, see PR40816. 5233 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5234 if (isOutOfScope(I)) { 5235 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5236 << *I << "\n"); 5237 return; 5238 } 5239 if (isScalarWithPredication(I, VF)) { 5240 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5241 << *I << "\n"); 5242 return; 5243 } 5244 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5245 Worklist.insert(I); 5246 }; 5247 5248 // Start with the conditional branch. If the branch condition is an 5249 // instruction contained in the loop that is only used by the branch, it is 5250 // uniform. 5251 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5252 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5253 addToWorklistIfAllowed(Cmp); 5254 5255 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 5256 // are pointers that are treated like consecutive pointers during 5257 // vectorization. The pointer operands of interleaved accesses are an 5258 // example. 5259 SmallSetVector<Value *, 8> ConsecutiveLikePtrs; 5260 5261 // Holds pointer operands of instructions that are possibly non-uniform. 5262 SmallPtrSet<Value *, 8> PossibleNonUniformPtrs; 5263 5264 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5265 InstWidening WideningDecision = getWideningDecision(I, VF); 5266 assert(WideningDecision != CM_Unknown && 5267 "Widening decision should be ready at this moment"); 5268 5269 // The address of a uniform mem op is itself uniform. We exclude stores 5270 // here as there's an assumption in the current code that all uses of 5271 // uniform instructions are uniform and, as noted below, uniform stores are 5272 // still handled via replication (i.e. aren't uniform after vectorization). 5273 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5274 assert(WideningDecision == CM_Scalarize); 5275 return true; 5276 } 5277 5278 return (WideningDecision == CM_Widen || 5279 WideningDecision == CM_Widen_Reverse || 5280 WideningDecision == CM_Interleave); 5281 }; 5282 5283 5284 // Returns true if Ptr is the pointer operand of a memory access instruction 5285 // I, and I is known to not require scalarization. 
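// E.g. a store that will be widened as a consecutive access keeps its pointer
// operand uniform (only the lane-0 address is needed to form the wide store),
// even though its value operand is widened.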
5286 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5287 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5288 }; 5289 5290 // Iterate over the instructions in the loop, and collect all 5291 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 5292 // that a consecutive-like pointer operand will be scalarized, we collect it 5293 // in PossibleNonUniformPtrs instead. We use two sets here because a single 5294 // getelementptr instruction can be used by both vectorized and scalarized 5295 // memory instructions. For example, if a loop loads and stores from the same 5296 // location, but the store is conditional, the store will be scalarized, and 5297 // the getelementptr won't remain uniform. 5298 for (auto *BB : TheLoop->blocks()) 5299 for (auto &I : *BB) { 5300 // If there's no pointer operand, there's nothing to do. 5301 auto *Ptr = getLoadStorePointerOperand(&I); 5302 if (!Ptr) 5303 continue; 5304 5305 // For now, avoid walking use lists in other functions. 5306 // TODO: Rewrite this algorithm from uses up. 5307 if (!isa<Instruction>(Ptr) && !isa<Argument>(Ptr)) 5308 continue; 5309 5310 // A uniform memory op is itself uniform. We exclude stores here as we 5311 // haven't yet added dedicated logic in the CLONE path and rely on 5312 // REPLICATE + DSE for correctness. 5313 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5314 addToWorklistIfAllowed(&I); 5315 5316 // True if all users of Ptr are memory accesses that have Ptr as their 5317 // pointer operand. Since loops are assumed to be in LCSSA form, this 5318 // disallows uses outside the loop as well. 5319 auto UsersAreMemAccesses = 5320 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 5321 return getLoadStorePointerOperand(U) == Ptr; 5322 }); 5323 5324 // Ensure the memory instruction will not be scalarized or used by 5325 // gather/scatter, making its pointer operand non-uniform. If the pointer 5326 // operand is used by any instruction other than a memory access, we 5327 // conservatively assume the pointer operand may be non-uniform. 5328 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 5329 PossibleNonUniformPtrs.insert(Ptr); 5330 5331 // If the memory instruction will be vectorized and its pointer operand 5332 // is consecutive-like, or interleaving - the pointer operand should 5333 // remain uniform. 5334 else 5335 ConsecutiveLikePtrs.insert(Ptr); 5336 } 5337 5338 // Add to the Worklist all consecutive and consecutive-like pointers that 5339 // aren't also identified as possibly non-uniform. 5340 for (auto *V : ConsecutiveLikePtrs) 5341 if (!PossibleNonUniformPtrs.count(V)) 5342 if (auto *I = dyn_cast<Instruction>(V)) 5343 addToWorklistIfAllowed(I); 5344 5345 // Expand Worklist in topological order: whenever a new instruction 5346 // is added , its users should be already inside Worklist. It ensures 5347 // a uniform instruction will only be used by uniform instructions. 5348 unsigned idx = 0; 5349 while (idx != Worklist.size()) { 5350 Instruction *I = Worklist[idx++]; 5351 5352 for (auto OV : I->operand_values()) { 5353 // isOutOfScope operands cannot be uniform instructions. 5354 if (isOutOfScope(OV)) 5355 continue; 5356 // First order recurrence Phi's should typically be considered 5357 // non-uniform. 5358 auto *OP = dyn_cast<PHINode>(OV); 5359 if (OP && Legal->isFirstOrderRecurrence(OP)) 5360 continue; 5361 // If all the users of the operand are uniform, then add the 5362 // operand into the uniform worklist. 
5363 auto *OI = cast<Instruction>(OV); 5364 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5365 auto *J = cast<Instruction>(U); 5366 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5367 })) 5368 addToWorklistIfAllowed(OI); 5369 } 5370 } 5371 5372 // For an instruction to be added into Worklist above, all its users inside 5373 // the loop should also be in Worklist. However, this condition cannot be 5374 // true for phi nodes that form a cyclic dependence. We must process phi 5375 // nodes separately. An induction variable will remain uniform if all users 5376 // of the induction variable and induction variable update remain uniform. 5377 // The code below handles both pointer and non-pointer induction variables. 5378 for (auto &Induction : Legal->getInductionVars()) { 5379 auto *Ind = Induction.first; 5380 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5381 5382 // Determine if all users of the induction variable are uniform after 5383 // vectorization. 5384 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5385 auto *I = cast<Instruction>(U); 5386 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5387 isVectorizedMemAccessUse(I, Ind); 5388 }); 5389 if (!UniformInd) 5390 continue; 5391 5392 // Determine if all users of the induction variable update instruction are 5393 // uniform after vectorization. 5394 auto UniformIndUpdate = 5395 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5396 auto *I = cast<Instruction>(U); 5397 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5398 isVectorizedMemAccessUse(I, IndUpdate); 5399 }); 5400 if (!UniformIndUpdate) 5401 continue; 5402 5403 // The induction variable and its update instruction will remain uniform. 5404 addToWorklistIfAllowed(Ind); 5405 addToWorklistIfAllowed(IndUpdate); 5406 } 5407 5408 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5409 } 5410 5411 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5412 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5413 5414 if (Legal->getRuntimePointerChecking()->Need) { 5415 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5416 "runtime pointer checks needed. Enable vectorization of this " 5417 "loop with '#pragma clang loop vectorize(enable)' when " 5418 "compiling with -Os/-Oz", 5419 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5420 return true; 5421 } 5422 5423 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5424 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5425 "runtime SCEV checks needed. Enable vectorization of this " 5426 "loop with '#pragma clang loop vectorize(enable)' when " 5427 "compiling with -Os/-Oz", 5428 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5429 return true; 5430 } 5431 5432 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5433 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5434 reportVectorizationFailure("Runtime stride check for small trip count", 5435 "runtime stride == 1 checks needed. 
Enable vectorization of "
5436 "this loop without such check by compiling with -Os/-Oz",
5437 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5438 return true;
5439 }
5440
5441 return false;
5442 }
5443
5444 Optional<ElementCount>
5445 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5446 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5447 // TODO: It may be useful to do this, since the check is still likely to be
5448 // dynamically uniform if the target can skip it.
5449 reportVectorizationFailure(
5450 "Not inserting runtime ptr check for divergent target",
5451 "runtime pointer checks needed. Not enabled for divergent target",
5452 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5453 return None;
5454 }
5455
5456 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5457 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5458 if (TC == 1) {
5459 reportVectorizationFailure("Single iteration (non) loop",
5460 "loop trip count is one, irrelevant for vectorization",
5461 "SingleIterationLoop", ORE, TheLoop);
5462 return None;
5463 }
5464
5465 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5466
5467 switch (ScalarEpilogueStatus) {
5468 case CM_ScalarEpilogueAllowed:
5469 return MaxVF;
5470 case CM_ScalarEpilogueNotNeededUsePredicate:
5471 LLVM_DEBUG(
5472 dbgs() << "LV: vector predicate hint/switch found.\n"
5473 << "LV: Not allowing scalar epilogue, creating predicated "
5474 << "vector loop.\n");
5475 break;
5476 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5477 // fallthrough as a special case of OptForSize
5478 case CM_ScalarEpilogueNotAllowedOptSize:
5479 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5480 LLVM_DEBUG(
5481 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5482 else
5483 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5484 << "count.\n");
5485
5486 // Bail if runtime checks are required, which are not good when optimising
5487 // for size.
5488 if (runtimeChecksRequired())
5489 return None;
5490 break;
5491 }
5492
5493 // Now try tail folding.
5494
5495 // Invalidate interleave groups that require an epilogue if we can't mask
5496 // the interleave-group.
5497 if (!useMaskedInterleavedAccesses(TTI)) {
5498 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5499 "No decisions should have been taken at this point");
5500 // Note: There is no need to invalidate any cost modeling decisions here, as
5501 // none were taken so far.
5502 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5503 }
5504
5505 assert(!MaxVF.isScalable() &&
5506 "Scalable vectors do not yet support tail folding");
5507 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5508 "MaxVF must be a power of 2");
5509 unsigned MaxVFtimesIC =
5510 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5511 if (TC > 0 && TC % MaxVFtimesIC == 0) {
5512 // Accept MaxVF if we do not have a tail.
5513 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5514 return MaxVF;
5515 }
5516
5517 // If we don't know the precise trip count, or if the trip count that we
5518 // found modulo the vectorization factor is not zero, try to fold the tail
5519 // by masking.
5520 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5521 if (Legal->prepareToFoldTailByMasking()) { 5522 FoldTailByMasking = true; 5523 return MaxVF; 5524 } 5525 5526 // If there was a tail-folding hint/switch, but we can't fold the tail by 5527 // masking, fallback to a vectorization with a scalar epilogue. 5528 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5529 if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) { 5530 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5531 return None; 5532 } 5533 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5534 "scalar epilogue instead.\n"); 5535 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5536 return MaxVF; 5537 } 5538 5539 if (TC == 0) { 5540 reportVectorizationFailure( 5541 "Unable to calculate the loop count due to complex control flow", 5542 "unable to calculate the loop count due to complex control flow", 5543 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5544 return None; 5545 } 5546 5547 reportVectorizationFailure( 5548 "Cannot optimize for size and vectorize at the same time.", 5549 "cannot optimize for size and vectorize at the same time. " 5550 "Enable vectorization of this loop with '#pragma clang loop " 5551 "vectorize(enable)' when compiling with -Os/-Oz", 5552 "NoTailLoopWithOptForSize", ORE, TheLoop); 5553 return None; 5554 } 5555 5556 ElementCount 5557 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5558 ElementCount UserVF) { 5559 assert(!UserVF.isScalable() && "scalable vectorization not yet handled"); 5560 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5561 unsigned SmallestType, WidestType; 5562 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5563 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5564 5565 // Get the maximum safe dependence distance in bits computed by LAA. 5566 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5567 // the memory accesses that is most restrictive (involved in the smallest 5568 // dependence distance). 5569 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5570 5571 if (UserVF.isNonZero()) { 5572 // If legally unsafe, clamp the user vectorization factor to a safe value. 5573 unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5574 if (UserVF.getFixedValue() <= MaxSafeVF) 5575 return UserVF; 5576 5577 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5578 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5579 << ".\n"); 5580 ORE->emit([&]() { 5581 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5582 TheLoop->getStartLoc(), 5583 TheLoop->getHeader()) 5584 << "User-specified vectorization factor " 5585 << ore::NV("UserVectorizationFactor", UserVF) 5586 << " is unsafe, clamping to maximum safe vectorization factor " 5587 << ore::NV("VectorizationFactor", MaxSafeVF); 5588 }); 5589 return ElementCount::getFixed(MaxSafeVF); 5590 } 5591 5592 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5593 5594 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5595 // Note that both WidestRegister and WidestType may not be a powers of 2. 
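// Illustrative example (hypothetical target values, not taken from this code):
// with a 256-bit widest register and a 32-bit widest type, the line below
// yields MaxVectorSize = PowerOf2Floor(256 / 32) = 8 lanes. If the register
// width had been clamped above to a 96-bit safe dependence distance, then
// PowerOf2Floor(96 / 32) would round the result down to 2.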
5596 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5597 5598 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5599 << " / " << WidestType << " bits.\n"); 5600 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5601 << WidestRegister << " bits.\n"); 5602 5603 assert(MaxVectorSize <= WidestRegister && 5604 "Did not expect to pack so many elements" 5605 " into one vector!"); 5606 if (MaxVectorSize == 0) { 5607 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5608 MaxVectorSize = 1; 5609 return ElementCount::getFixed(MaxVectorSize); 5610 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5611 isPowerOf2_32(ConstTripCount)) { 5612 // We need to clamp the VF to be the ConstTripCount. There is no point in 5613 // choosing a higher viable VF as done in the loop below. 5614 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5615 << ConstTripCount << "\n"); 5616 MaxVectorSize = ConstTripCount; 5617 return ElementCount::getFixed(MaxVectorSize); 5618 } 5619 5620 unsigned MaxVF = MaxVectorSize; 5621 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5622 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5623 // Collect all viable vectorization factors larger than the default MaxVF 5624 // (i.e. MaxVectorSize). 5625 SmallVector<ElementCount, 8> VFs; 5626 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5627 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5628 VFs.push_back(ElementCount::getFixed(VS)); 5629 5630 // For each VF calculate its register usage. 5631 auto RUs = calculateRegisterUsage(VFs); 5632 5633 // Select the largest VF which doesn't require more registers than existing 5634 // ones. 5635 for (int i = RUs.size() - 1; i >= 0; --i) { 5636 bool Selected = true; 5637 for (auto& pair : RUs[i].MaxLocalUsers) { 5638 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5639 if (pair.second > TargetNumRegisters) 5640 Selected = false; 5641 } 5642 if (Selected) { 5643 MaxVF = VFs[i].getKnownMinValue(); 5644 break; 5645 } 5646 } 5647 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5648 if (MaxVF < MinVF) { 5649 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5650 << ") with target's minimum: " << MinVF << '\n'); 5651 MaxVF = MinVF; 5652 } 5653 } 5654 } 5655 return ElementCount::getFixed(MaxVF); 5656 } 5657 5658 VectorizationFactor 5659 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5660 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5661 5662 float Cost = expectedCost(ElementCount::getFixed(1)).first; 5663 const float ScalarCost = Cost; 5664 unsigned Width = 1; 5665 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5666 5667 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5668 if (ForceVectorization && MaxVF.isVector()) { 5669 // Ignore scalar width, because the user explicitly wants vectorization. 5670 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5671 // evaluation. 5672 Cost = std::numeric_limits<float>::max(); 5673 } 5674 5675 for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { 5676 // Notice that the vector loop needs to be executed less times, so 5677 // we need to divide the cost of the vector loops by the width of 5678 // the vector elements. 
5679 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5680 float VectorCost = C.first / (float)i; 5681 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5682 << " costs: " << (int)VectorCost << ".\n"); 5683 if (!C.second && !ForceVectorization) { 5684 LLVM_DEBUG( 5685 dbgs() << "LV: Not considering vector loop of width " << i 5686 << " because it will not generate any vector instructions.\n"); 5687 continue; 5688 } 5689 5690 // If profitable add it to ProfitableVF list. 5691 if (VectorCost < ScalarCost) { 5692 ProfitableVFs.push_back(VectorizationFactor( 5693 {ElementCount::getFixed(i), (unsigned)VectorCost})); 5694 } 5695 5696 if (VectorCost < Cost) { 5697 Cost = VectorCost; 5698 Width = i; 5699 } 5700 } 5701 5702 if (!EnableCondStoresVectorization && NumPredStores) { 5703 reportVectorizationFailure("There are conditional stores.", 5704 "store that is conditionally executed prevents vectorization", 5705 "ConditionalStore", ORE, TheLoop); 5706 Width = 1; 5707 Cost = ScalarCost; 5708 } 5709 5710 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5711 << "LV: Vectorization seems to be not beneficial, " 5712 << "but was forced by a user.\n"); 5713 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5714 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5715 (unsigned)(Width * Cost)}; 5716 return Factor; 5717 } 5718 5719 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5720 const Loop &L, ElementCount VF) const { 5721 // Cross iteration phis such as reductions need special handling and are 5722 // currently unsupported. 5723 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5724 return Legal->isFirstOrderRecurrence(&Phi) || 5725 Legal->isReductionVariable(&Phi); 5726 })) 5727 return false; 5728 5729 // Phis with uses outside of the loop require special handling and are 5730 // currently unsupported. 5731 for (auto &Entry : Legal->getInductionVars()) { 5732 // Look for uses of the value of the induction at the last iteration. 5733 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5734 for (User *U : PostInc->users()) 5735 if (!L.contains(cast<Instruction>(U))) 5736 return false; 5737 // Look for uses of penultimate value of the induction. 5738 for (User *U : Entry.first->users()) 5739 if (!L.contains(cast<Instruction>(U))) 5740 return false; 5741 } 5742 5743 // Induction variables that are widened require special handling that is 5744 // currently not supported. 5745 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5746 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5747 this->isProfitableToScalarize(Entry.first, VF)); 5748 })) 5749 return false; 5750 5751 return true; 5752 } 5753 5754 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5755 const ElementCount VF) const { 5756 // FIXME: We need a much better cost-model to take different parameters such 5757 // as register pressure, code size increase and cost of extra branches into 5758 // account. For now we apply a very crude heuristic and only consider loops 5759 // with vectorization factors larger than a certain value. 5760 // We also consider epilogue vectorization unprofitable for targets that don't 5761 // consider interleaving beneficial (eg. MVE). 
5762 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5763 return false; 5764 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5765 return true; 5766 return false; 5767 } 5768 5769 VectorizationFactor 5770 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5771 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5772 VectorizationFactor Result = VectorizationFactor::Disabled(); 5773 if (!EnableEpilogueVectorization) { 5774 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5775 return Result; 5776 } 5777 5778 if (!isScalarEpilogueAllowed()) { 5779 LLVM_DEBUG( 5780 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5781 "allowed.\n";); 5782 return Result; 5783 } 5784 5785 // Not really a cost consideration, but check for unsupported cases here to 5786 // simplify the logic. 5787 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5788 LLVM_DEBUG( 5789 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5790 "not a supported candidate.\n";); 5791 return Result; 5792 } 5793 5794 if (EpilogueVectorizationForceVF > 1) { 5795 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5796 if (LVP.hasPlanWithVFs( 5797 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5798 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5799 else { 5800 LLVM_DEBUG( 5801 dbgs() 5802 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5803 return Result; 5804 } 5805 } 5806 5807 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5808 TheLoop->getHeader()->getParent()->hasMinSize()) { 5809 LLVM_DEBUG( 5810 dbgs() 5811 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5812 return Result; 5813 } 5814 5815 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 5816 return Result; 5817 5818 for (auto &NextVF : ProfitableVFs) 5819 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 5820 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 5821 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 5822 Result = NextVF; 5823 5824 if (Result != VectorizationFactor::Disabled()) 5825 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5826 << Result.Width.getFixedValue() << "\n";); 5827 return Result; 5828 } 5829 5830 std::pair<unsigned, unsigned> 5831 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5832 unsigned MinWidth = -1U; 5833 unsigned MaxWidth = 8; 5834 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5835 5836 // For each block. 5837 for (BasicBlock *BB : TheLoop->blocks()) { 5838 // For each instruction in the loop. 5839 for (Instruction &I : BB->instructionsWithoutDebug()) { 5840 Type *T = I.getType(); 5841 5842 // Skip ignored values. 5843 if (ValuesToIgnore.count(&I)) 5844 continue; 5845 5846 // Only examine Loads, Stores and PHINodes. 5847 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5848 continue; 5849 5850 // Examine PHI nodes that are reduction variables. Update the type to 5851 // account for the recurrence type. 5852 if (auto *PN = dyn_cast<PHINode>(&I)) { 5853 if (!Legal->isReductionVariable(PN)) 5854 continue; 5855 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5856 T = RdxDesc.getRecurrenceType(); 5857 } 5858 5859 // Examine the stored values. 
5860 if (auto *ST = dyn_cast<StoreInst>(&I)) 5861 T = ST->getValueOperand()->getType(); 5862 5863 // Ignore loaded pointer types and stored pointer types that are not 5864 // vectorizable. 5865 // 5866 // FIXME: The check here attempts to predict whether a load or store will 5867 // be vectorized. We only know this for certain after a VF has 5868 // been selected. Here, we assume that if an access can be 5869 // vectorized, it will be. We should also look at extending this 5870 // optimization to non-pointer types. 5871 // 5872 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5873 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5874 continue; 5875 5876 MinWidth = std::min(MinWidth, 5877 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5878 MaxWidth = std::max(MaxWidth, 5879 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5880 } 5881 } 5882 5883 return {MinWidth, MaxWidth}; 5884 } 5885 5886 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5887 unsigned LoopCost) { 5888 // -- The interleave heuristics -- 5889 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5890 // There are many micro-architectural considerations that we can't predict 5891 // at this level. For example, frontend pressure (on decode or fetch) due to 5892 // code size, or the number and capabilities of the execution ports. 5893 // 5894 // We use the following heuristics to select the interleave count: 5895 // 1. If the code has reductions, then we interleave to break the cross 5896 // iteration dependency. 5897 // 2. If the loop is really small, then we interleave to reduce the loop 5898 // overhead. 5899 // 3. We don't interleave if we think that we will spill registers to memory 5900 // due to the increased register pressure. 5901 5902 if (!isScalarEpilogueAllowed()) 5903 return 1; 5904 5905 // We used the distance for the interleave count. 5906 if (Legal->getMaxSafeDepDistBytes() != -1U) 5907 return 1; 5908 5909 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5910 const bool HasReductions = !Legal->getReductionVars().empty(); 5911 // Do not interleave loops with a relatively small known or estimated trip 5912 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5913 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5914 // because with the above conditions interleaving can expose ILP and break 5915 // cross iteration dependences for reductions. 5916 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5917 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5918 return 1; 5919 5920 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5921 // We divide by these constants so assume that we have at least one 5922 // instruction that uses at least one register. 5923 for (auto& pair : R.MaxLocalUsers) { 5924 pair.second = std::max(pair.second, 1U); 5925 } 5926 5927 // We calculate the interleave count using the following formula. 5928 // Subtract the number of loop invariants from the number of available 5929 // registers. These registers are used by all of the interleaved instances. 5930 // Next, divide the remaining registers by the number of registers that is 5931 // required by the loop, in order to estimate how many parallel instances 5932 // fit without causing spills. All of this is rounded down if necessary to be 5933 // a power of two. 
We want power of two interleave count to simplify any 5934 // addressing operations or alignment considerations. 5935 // We also want power of two interleave counts to ensure that the induction 5936 // variable of the vector loop wraps to zero, when tail is folded by masking; 5937 // this currently happens when OptForSize, in which case IC is set to 1 above. 5938 unsigned IC = UINT_MAX; 5939 5940 for (auto& pair : R.MaxLocalUsers) { 5941 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5942 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5943 << " registers of " 5944 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5945 if (VF.isScalar()) { 5946 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5947 TargetNumRegisters = ForceTargetNumScalarRegs; 5948 } else { 5949 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5950 TargetNumRegisters = ForceTargetNumVectorRegs; 5951 } 5952 unsigned MaxLocalUsers = pair.second; 5953 unsigned LoopInvariantRegs = 0; 5954 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5955 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5956 5957 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5958 // Don't count the induction variable as interleaved. 5959 if (EnableIndVarRegisterHeur) { 5960 TmpIC = 5961 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5962 std::max(1U, (MaxLocalUsers - 1))); 5963 } 5964 5965 IC = std::min(IC, TmpIC); 5966 } 5967 5968 // Clamp the interleave ranges to reasonable counts. 5969 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5970 unsigned MaxInterleaveCount = 5971 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5972 5973 // Check if the user has overridden the max. 5974 if (VF.isScalar()) { 5975 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5976 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5977 } else { 5978 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5979 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5980 } 5981 5982 // If trip count is known or estimated compile time constant, limit the 5983 // interleave count to be less than the trip count divided by VF, provided it 5984 // is at least 1. 5985 if (BestKnownTC) { 5986 MaxInterleaveCount = 5987 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5988 // Make sure MaxInterleaveCount is greater than 0. 5989 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5990 } 5991 5992 assert(MaxInterleaveCount > 0 && 5993 "Maximum interleave count must be greater than 0"); 5994 5995 // Clamp the calculated IC to be between the 1 and the max interleave count 5996 // that the target and trip count allows. 5997 if (IC > MaxInterleaveCount) 5998 IC = MaxInterleaveCount; 5999 else 6000 // Make sure IC is greater than 0. 6001 IC = std::max(1u, IC); 6002 6003 assert(IC > 0 && "Interleave count must be greater than 0."); 6004 6005 // If we did not calculate the cost for VF (because the user selected the VF) 6006 // then we calculate the cost of VF here. 6007 if (LoopCost == 0) 6008 LoopCost = expectedCost(VF).first; 6009 6010 assert(LoopCost && "Non-zero loop cost expected"); 6011 6012 // Interleave if we vectorized this loop and there is a reduction that could 6013 // benefit from interleaving. 
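// For example, a loop that accumulates a single sum carries a dependence
// through the reduction phi from one iteration to the next; interleaving by IC
// keeps IC independent partial sums in flight and combines them after the
// loop, hiding the latency of that cross-iteration dependence.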
6014 if (VF.isVector() && HasReductions) { 6015 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6016 return IC; 6017 } 6018 6019 // Note that if we've already vectorized the loop we will have done the 6020 // runtime check and so interleaving won't require further checks. 6021 bool InterleavingRequiresRuntimePointerCheck = 6022 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6023 6024 // We want to interleave small loops in order to reduce the loop overhead and 6025 // potentially expose ILP opportunities. 6026 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6027 << "LV: IC is " << IC << '\n' 6028 << "LV: VF is " << VF.getKnownMinValue() << '\n'); 6029 const bool AggressivelyInterleaveReductions = 6030 TTI.enableAggressiveInterleaving(HasReductions); 6031 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6032 // We assume that the cost overhead is 1 and we use the cost model 6033 // to estimate the cost of the loop and interleave until the cost of the 6034 // loop overhead is about 5% of the cost of the loop. 6035 unsigned SmallIC = 6036 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6037 6038 // Interleave until store/load ports (estimated by max interleave count) are 6039 // saturated. 6040 unsigned NumStores = Legal->getNumStores(); 6041 unsigned NumLoads = Legal->getNumLoads(); 6042 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6043 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6044 6045 // If we have a scalar reduction (vector reductions are already dealt with 6046 // by this point), we can increase the critical path length if the loop 6047 // we're interleaving is inside another loop. Limit, by default to 2, so the 6048 // critical path only gets increased by one reduction operation. 6049 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6050 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6051 SmallIC = std::min(SmallIC, F); 6052 StoresIC = std::min(StoresIC, F); 6053 LoadsIC = std::min(LoadsIC, F); 6054 } 6055 6056 if (EnableLoadStoreRuntimeInterleave && 6057 std::max(StoresIC, LoadsIC) > SmallIC) { 6058 LLVM_DEBUG( 6059 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6060 return std::max(StoresIC, LoadsIC); 6061 } 6062 6063 // If there are scalar reductions and TTI has enabled aggressive 6064 // interleaving for reductions, we will interleave to expose ILP. 6065 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6066 AggressivelyInterleaveReductions) { 6067 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6068 // Interleave no less than SmallIC but not as aggressive as the normal IC 6069 // to satisfy the rare situation when resources are too limited. 6070 return std::max(IC / 2, SmallIC); 6071 } else { 6072 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6073 return SmallIC; 6074 } 6075 } 6076 6077 // Interleave if this is a large loop (small loops are already dealt with by 6078 // this point) that could benefit from interleaving. 
6079 if (AggressivelyInterleaveReductions) {
6080 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6081 return IC;
6082 }
6083
6084 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6085 return 1;
6086 }
6087
6088 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6089 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6090 // This function calculates the register usage by measuring the highest number
6091 // of values that are alive at a single location. Obviously, this is a very
6092 // rough estimation. We scan the loop in topological order and
6093 // assign a number to each instruction. We use RPO to ensure that defs are
6094 // met before their users. We assume that each instruction that has in-loop
6095 // users starts an interval. We record every time that an in-loop value is
6096 // used, so we have a list of the first and last occurrences of each
6097 // instruction. Next, we transpose this data structure into a multi map that
6098 // holds the list of intervals that *end* at a specific location. This multi
6099 // map allows us to perform a linear search. We scan the instructions linearly
6100 // and record each time that a new interval starts, by placing it in a set.
6101 // If we find this value in the multi-map then we remove it from the set.
6102 // The max register usage is the maximum size of the set.
6103 // We also search for instructions that are defined outside the loop, but are
6104 // used inside the loop. We need this number separately from the max-interval
6105 // usage number because when we unroll, loop-invariant values do not take
6106 // more registers.
6107 LoopBlocksDFS DFS(TheLoop);
6108 DFS.perform(LI);
6109
6110 RegisterUsage RU;
6111
6112 // Each 'key' in the map opens a new interval. Each value
6113 // of the map is the index of the 'last seen' usage of the
6114 // instruction that is the key.
6115 using IntervalMap = DenseMap<Instruction *, unsigned>;
6116
6117 // Maps instruction to its index.
6118 SmallVector<Instruction *, 64> IdxToInstr;
6119 // Marks the end of each interval.
6120 IntervalMap EndPoint;
6121 // Saves the list of instruction indices that are used in the loop.
6122 SmallPtrSet<Instruction *, 8> Ends;
6123 // Saves the list of values that are used in the loop but are
6124 // defined outside the loop, such as arguments and constants.
6125 SmallPtrSet<Value *, 8> LoopInvariants;
6126
6127 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6128 for (Instruction &I : BB->instructionsWithoutDebug()) {
6129 IdxToInstr.push_back(&I);
6130
6131 // Save the end location of each USE.
6132 for (Value *U : I.operands()) {
6133 auto *Instr = dyn_cast<Instruction>(U);
6134
6135 // Ignore non-instruction values such as arguments, constants, etc.
6136 if (!Instr)
6137 continue;
6138
6139 // If this instruction is outside the loop then record it and continue.
6140 if (!TheLoop->contains(Instr)) {
6141 LoopInvariants.insert(Instr);
6142 continue;
6143 }
6144
6145 // Overwrite previous end points.
6146 EndPoint[Instr] = IdxToInstr.size();
6147 Ends.insert(Instr);
6148 }
6149 }
6150 }
6151
6152 // Saves the list of intervals that end with the index in 'key'.
6153 using InstrList = SmallVector<Instruction *, 2>;
6154 DenseMap<unsigned, InstrList> TransposeEnds;
6155
6156 // Transpose the EndPoints to a list of values that end at each index.
6157 for (auto &Interval : EndPoint) 6158 TransposeEnds[Interval.second].push_back(Interval.first); 6159 6160 SmallPtrSet<Instruction *, 8> OpenIntervals; 6161 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6162 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6163 6164 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6165 6166 // A lambda that gets the register usage for the given type and VF. 6167 const auto &TTICapture = TTI; 6168 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6169 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6170 return 0U; 6171 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6172 }; 6173 6174 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6175 Instruction *I = IdxToInstr[i]; 6176 6177 // Remove all of the instructions that end at this location. 6178 InstrList &List = TransposeEnds[i]; 6179 for (Instruction *ToRemove : List) 6180 OpenIntervals.erase(ToRemove); 6181 6182 // Ignore instructions that are never used within the loop. 6183 if (!Ends.count(I)) 6184 continue; 6185 6186 // Skip ignored values. 6187 if (ValuesToIgnore.count(I)) 6188 continue; 6189 6190 // For each VF find the maximum usage of registers. 6191 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6192 // Count the number of live intervals. 6193 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6194 6195 if (VFs[j].isScalar()) { 6196 for (auto Inst : OpenIntervals) { 6197 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6198 if (RegUsage.find(ClassID) == RegUsage.end()) 6199 RegUsage[ClassID] = 1; 6200 else 6201 RegUsage[ClassID] += 1; 6202 } 6203 } else { 6204 collectUniformsAndScalars(VFs[j]); 6205 for (auto Inst : OpenIntervals) { 6206 // Skip ignored values for VF > 1. 6207 if (VecValuesToIgnore.count(Inst)) 6208 continue; 6209 if (isScalarAfterVectorization(Inst, VFs[j])) { 6210 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6211 if (RegUsage.find(ClassID) == RegUsage.end()) 6212 RegUsage[ClassID] = 1; 6213 else 6214 RegUsage[ClassID] += 1; 6215 } else { 6216 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6217 if (RegUsage.find(ClassID) == RegUsage.end()) 6218 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6219 else 6220 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6221 } 6222 } 6223 } 6224 6225 for (auto& pair : RegUsage) { 6226 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6227 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6228 else 6229 MaxUsages[j][pair.first] = pair.second; 6230 } 6231 } 6232 6233 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6234 << OpenIntervals.size() << '\n'); 6235 6236 // Add the current instruction to the list of open intervals. 6237 OpenIntervals.insert(I); 6238 } 6239 6240 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6241 SmallMapVector<unsigned, unsigned, 4> Invariant; 6242 6243 for (auto Inst : LoopInvariants) { 6244 unsigned Usage = 6245 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6246 unsigned ClassID = 6247 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6248 if (Invariant.find(ClassID) == Invariant.end()) 6249 Invariant[ClassID] = Usage; 6250 else 6251 Invariant[ClassID] += Usage; 6252 } 6253 6254 LLVM_DEBUG({ 6255 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6256 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6257 << " item\n"; 6258 for (const auto &pair : MaxUsages[i]) { 6259 dbgs() << "LV(REG): RegisterClass: " 6260 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6261 << " registers\n"; 6262 } 6263 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6264 << " item\n"; 6265 for (const auto &pair : Invariant) { 6266 dbgs() << "LV(REG): RegisterClass: " 6267 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6268 << " registers\n"; 6269 } 6270 }); 6271 6272 RU.LoopInvariantRegs = Invariant; 6273 RU.MaxLocalUsers = MaxUsages[i]; 6274 RUs[i] = RU; 6275 } 6276 6277 return RUs; 6278 } 6279 6280 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6281 // TODO: Cost model for emulated masked load/store is completely 6282 // broken. This hack guides the cost model to use an artificially 6283 // high enough value to practically disable vectorization with such 6284 // operations, except where previously deployed legality hack allowed 6285 // using very low cost values. This is to avoid regressions coming simply 6286 // from moving "masked load/store" check from legality to cost model. 6287 // Masked Load/Gather emulation was previously never allowed. 6288 // Limited number of Masked Store/Scatter emulation was allowed. 6289 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6290 return isa<LoadInst>(I) || 6291 (isa<StoreInst>(I) && 6292 NumPredStores > NumberOfStoresToPredicate); 6293 } 6294 6295 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6296 // If we aren't vectorizing the loop, or if we've already collected the 6297 // instructions to scalarize, there's nothing to do. Collection may already 6298 // have occurred if we have a user-selected VF and are now computing the 6299 // expected cost for interleaving. 6300 if (VF.isScalar() || VF.isZero() || 6301 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6302 return; 6303 6304 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6305 // not profitable to scalarize any instructions, the presence of VF in the 6306 // map will indicate that we've analyzed it already. 6307 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6308 6309 // Find all the instructions that are scalar with predication in the loop and 6310 // determine if it would be better to not if-convert the blocks they are in. 6311 // If so, we also record the instructions to scalarize. 6312 for (BasicBlock *BB : TheLoop->blocks()) { 6313 if (!blockNeedsPredication(BB)) 6314 continue; 6315 for (Instruction &I : *BB) 6316 if (isScalarWithPredication(&I)) { 6317 ScalarCostsTy ScalarCosts; 6318 // Do not apply discount logic if hacked cost is needed 6319 // for emulated masked memrefs. 6320 if (!useEmulatedMaskMemRefHack(&I) && 6321 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6322 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6323 // Remember that BB will remain after vectorization. 
6324 PredicatedBBsAfterVectorization.insert(BB); 6325 } 6326 } 6327 } 6328 6329 int LoopVectorizationCostModel::computePredInstDiscount( 6330 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 6331 ElementCount VF) { 6332 assert(!isUniformAfterVectorization(PredInst, VF) && 6333 "Instruction marked uniform-after-vectorization will be predicated"); 6334 6335 // Initialize the discount to zero, meaning that the scalar version and the 6336 // vector version cost the same. 6337 int Discount = 0; 6338 6339 // Holds instructions to analyze. The instructions we visit are mapped in 6340 // ScalarCosts. Those instructions are the ones that would be scalarized if 6341 // we find that the scalar version costs less. 6342 SmallVector<Instruction *, 8> Worklist; 6343 6344 // Returns true if the given instruction can be scalarized. 6345 auto canBeScalarized = [&](Instruction *I) -> bool { 6346 // We only attempt to scalarize instructions forming a single-use chain 6347 // from the original predicated block that would otherwise be vectorized. 6348 // Although not strictly necessary, we give up on instructions we know will 6349 // already be scalar to avoid traversing chains that are unlikely to be 6350 // beneficial. 6351 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6352 isScalarAfterVectorization(I, VF)) 6353 return false; 6354 6355 // If the instruction is scalar with predication, it will be analyzed 6356 // separately. We ignore it within the context of PredInst. 6357 if (isScalarWithPredication(I)) 6358 return false; 6359 6360 // If any of the instruction's operands are uniform after vectorization, 6361 // the instruction cannot be scalarized. This prevents, for example, a 6362 // masked load from being scalarized. 6363 // 6364 // We assume we will only emit a value for lane zero of an instruction 6365 // marked uniform after vectorization, rather than VF identical values. 6366 // Thus, if we scalarize an instruction that uses a uniform, we would 6367 // create uses of values corresponding to the lanes we aren't emitting code 6368 // for. This behavior can be changed by allowing getScalarValue to clone 6369 // the lane zero values for uniforms rather than asserting. 6370 for (Use &U : I->operands()) 6371 if (auto *J = dyn_cast<Instruction>(U.get())) 6372 if (isUniformAfterVectorization(J, VF)) 6373 return false; 6374 6375 // Otherwise, we can scalarize the instruction. 6376 return true; 6377 }; 6378 6379 // Compute the expected cost discount from scalarizing the entire expression 6380 // feeding the predicated instruction. We currently only consider expressions 6381 // that are single-use instruction chains. 6382 Worklist.push_back(PredInst); 6383 while (!Worklist.empty()) { 6384 Instruction *I = Worklist.pop_back_val(); 6385 6386 // If we've already analyzed the instruction, there's nothing to do. 6387 if (ScalarCosts.find(I) != ScalarCosts.end()) 6388 continue; 6389 6390 // Compute the cost of the vector instruction. Note that this cost already 6391 // includes the scalarization overhead of the predicated instruction. 6392 unsigned VectorCost = getInstructionCost(I, VF).first; 6393 6394 // Compute the cost of the scalarized instruction. This cost is the cost of 6395 // the instruction as if it wasn't if-converted and instead remained in the 6396 // predicated block. We will scale this cost by block probability after 6397 // computing the scalarization overhead. 
6398 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6399 unsigned ScalarCost = 6400 VF.getKnownMinValue() * 6401 getInstructionCost(I, ElementCount::getFixed(1)).first; 6402 6403 // Compute the scalarization overhead of needed insertelement instructions 6404 // and phi nodes. 6405 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6406 ScalarCost += TTI.getScalarizationOverhead( 6407 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6408 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6409 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6410 ScalarCost += 6411 VF.getKnownMinValue() * 6412 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6413 } 6414 6415 // Compute the scalarization overhead of needed extractelement 6416 // instructions. For each of the instruction's operands, if the operand can 6417 // be scalarized, add it to the worklist; otherwise, account for the 6418 // overhead. 6419 for (Use &U : I->operands()) 6420 if (auto *J = dyn_cast<Instruction>(U.get())) { 6421 assert(VectorType::isValidElementType(J->getType()) && 6422 "Instruction has non-scalar type"); 6423 if (canBeScalarized(J)) 6424 Worklist.push_back(J); 6425 else if (needsExtract(J, VF)) { 6426 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6427 ScalarCost += TTI.getScalarizationOverhead( 6428 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6429 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6430 } 6431 } 6432 6433 // Scale the total scalar cost by block probability. 6434 ScalarCost /= getReciprocalPredBlockProb(); 6435 6436 // Compute the discount. A non-negative discount means the vector version 6437 // of the instruction costs more, and scalarizing would be beneficial. 6438 Discount += VectorCost - ScalarCost; 6439 ScalarCosts[I] = ScalarCost; 6440 } 6441 6442 return Discount; 6443 } 6444 6445 LoopVectorizationCostModel::VectorizationCostTy 6446 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6447 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6448 VectorizationCostTy Cost; 6449 6450 // For each block. 6451 for (BasicBlock *BB : TheLoop->blocks()) { 6452 VectorizationCostTy BlockCost; 6453 6454 // For each instruction in the old loop. 6455 for (Instruction &I : BB->instructionsWithoutDebug()) { 6456 // Skip ignored values. 6457 if (ValuesToIgnore.count(&I) || 6458 (VF.isVector() && VecValuesToIgnore.count(&I))) 6459 continue; 6460 6461 VectorizationCostTy C = getInstructionCost(&I, VF); 6462 6463 // Check if we should override the cost. 6464 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6465 C.first = ForceTargetInstructionCost; 6466 6467 BlockCost.first += C.first; 6468 BlockCost.second |= C.second; 6469 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6470 << " for VF " << VF << " For instruction: " << I 6471 << '\n'); 6472 } 6473 6474 // If we are vectorizing a predicated block, it will have been 6475 // if-converted. This means that the block's instructions (aside from 6476 // stores and instructions that may divide by zero) will now be 6477 // unconditionally executed. For the scalar case, we may not always execute 6478 // the predicated block. Thus, scale the block's cost by the probability of 6479 // executing it. 
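// Illustrative example (assuming a reciprocal predicated-block probability of
// 2, i.e. the block is expected to execute on about half of the iterations): a
// predicated block whose instructions cost 8 in the scalar loop contributes
// 8 / 2 = 4 to the expected scalar cost.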
6480 if (VF.isScalar() && blockNeedsPredication(BB)) 6481 BlockCost.first /= getReciprocalPredBlockProb(); 6482 6483 Cost.first += BlockCost.first; 6484 Cost.second |= BlockCost.second; 6485 } 6486 6487 return Cost; 6488 } 6489 6490 /// Gets Address Access SCEV after verifying that the access pattern 6491 /// is loop invariant except the induction variable dependence. 6492 /// 6493 /// This SCEV can be sent to the Target in order to estimate the address 6494 /// calculation cost. 6495 static const SCEV *getAddressAccessSCEV( 6496 Value *Ptr, 6497 LoopVectorizationLegality *Legal, 6498 PredicatedScalarEvolution &PSE, 6499 const Loop *TheLoop) { 6500 6501 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6502 if (!Gep) 6503 return nullptr; 6504 6505 // We are looking for a gep with all loop invariant indices except for one 6506 // which should be an induction variable. 6507 auto SE = PSE.getSE(); 6508 unsigned NumOperands = Gep->getNumOperands(); 6509 for (unsigned i = 1; i < NumOperands; ++i) { 6510 Value *Opd = Gep->getOperand(i); 6511 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6512 !Legal->isInductionVariable(Opd)) 6513 return nullptr; 6514 } 6515 6516 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6517 return PSE.getSCEV(Ptr); 6518 } 6519 6520 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6521 return Legal->hasStride(I->getOperand(0)) || 6522 Legal->hasStride(I->getOperand(1)); 6523 } 6524 6525 unsigned 6526 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6527 ElementCount VF) { 6528 assert(VF.isVector() && 6529 "Scalarization cost of instruction implies vectorization."); 6530 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6531 Type *ValTy = getMemInstValueType(I); 6532 auto SE = PSE.getSE(); 6533 6534 unsigned AS = getLoadStoreAddressSpace(I); 6535 Value *Ptr = getLoadStorePointerOperand(I); 6536 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6537 6538 // Figure out whether the access is strided and get the stride value 6539 // if it's known in compile time 6540 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6541 6542 // Get the cost of the scalar memory instruction and address computation. 6543 unsigned Cost = 6544 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6545 6546 // Don't pass *I here, since it is scalar but will actually be part of a 6547 // vectorized loop where the user of it is a vectorized instruction. 6548 const Align Alignment = getLoadStoreAlignment(I); 6549 Cost += VF.getKnownMinValue() * 6550 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6551 AS, TTI::TCK_RecipThroughput); 6552 6553 // Get the overhead of the extractelement and insertelement instructions 6554 // we might create due to scalarization. 6555 Cost += getScalarizationOverhead(I, VF); 6556 6557 // If we have a predicated store, it may not be executed for each vector 6558 // lane. Scale the cost by the probability of executing the predicated 6559 // block. 6560 if (isPredicatedInst(I)) { 6561 Cost /= getReciprocalPredBlockProb(); 6562 6563 if (useEmulatedMaskMemRefHack(I)) 6564 // Artificially setting to a high enough value to practically disable 6565 // vectorization with such operations. 
6566 Cost = 3000000; 6567 } 6568 6569 return Cost; 6570 } 6571 6572 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6573 ElementCount VF) { 6574 Type *ValTy = getMemInstValueType(I); 6575 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6576 Value *Ptr = getLoadStorePointerOperand(I); 6577 unsigned AS = getLoadStoreAddressSpace(I); 6578 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6579 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6580 6581 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6582 "Stride should be 1 or -1 for consecutive memory access"); 6583 const Align Alignment = getLoadStoreAlignment(I); 6584 unsigned Cost = 0; 6585 if (Legal->isMaskRequired(I)) 6586 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6587 CostKind); 6588 else 6589 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6590 CostKind, I); 6591 6592 bool Reverse = ConsecutiveStride < 0; 6593 if (Reverse) 6594 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6595 return Cost; 6596 } 6597 6598 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6599 ElementCount VF) { 6600 assert(Legal->isUniformMemOp(*I)); 6601 6602 Type *ValTy = getMemInstValueType(I); 6603 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6604 const Align Alignment = getLoadStoreAlignment(I); 6605 unsigned AS = getLoadStoreAddressSpace(I); 6606 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6607 if (isa<LoadInst>(I)) { 6608 return TTI.getAddressComputationCost(ValTy) + 6609 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6610 CostKind) + 6611 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6612 } 6613 StoreInst *SI = cast<StoreInst>(I); 6614 6615 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6616 return TTI.getAddressComputationCost(ValTy) + 6617 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6618 CostKind) + 6619 (isLoopInvariantStoreValue 6620 ? 0 6621 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6622 VF.getKnownMinValue() - 1)); 6623 } 6624 6625 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6626 ElementCount VF) { 6627 Type *ValTy = getMemInstValueType(I); 6628 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6629 const Align Alignment = getLoadStoreAlignment(I); 6630 const Value *Ptr = getLoadStorePointerOperand(I); 6631 6632 return TTI.getAddressComputationCost(VectorTy) + 6633 TTI.getGatherScatterOpCost( 6634 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6635 TargetTransformInfo::TCK_RecipThroughput, I); 6636 } 6637 6638 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6639 ElementCount VF) { 6640 Type *ValTy = getMemInstValueType(I); 6641 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6642 unsigned AS = getLoadStoreAddressSpace(I); 6643 6644 auto Group = getInterleavedAccessGroup(I); 6645 assert(Group && "Fail to get an interleaved access group."); 6646 6647 unsigned InterleaveFactor = Group->getFactor(); 6648 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6649 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6650 6651 // Holds the indices of existing members in an interleaved load group. 6652 // An interleaved store group doesn't need this as it doesn't allow gaps. 
6653 SmallVector<unsigned, 4> Indices; 6654 if (isa<LoadInst>(I)) { 6655 for (unsigned i = 0; i < InterleaveFactor; i++) 6656 if (Group->getMember(i)) 6657 Indices.push_back(i); 6658 } 6659 6660 // Calculate the cost of the whole interleaved group. 6661 bool UseMaskForGaps = 6662 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6663 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6664 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6665 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6666 6667 if (Group->isReverse()) { 6668 // TODO: Add support for reversed masked interleaved access. 6669 assert(!Legal->isMaskRequired(I) && 6670 "Reverse masked interleaved access not supported."); 6671 Cost += Group->getNumMembers() * 6672 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6673 } 6674 return Cost; 6675 } 6676 6677 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6678 ElementCount VF) { 6679 // Calculate scalar cost only. Vectorization cost should be ready at this 6680 // moment. 6681 if (VF.isScalar()) { 6682 Type *ValTy = getMemInstValueType(I); 6683 const Align Alignment = getLoadStoreAlignment(I); 6684 unsigned AS = getLoadStoreAddressSpace(I); 6685 6686 return TTI.getAddressComputationCost(ValTy) + 6687 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6688 TTI::TCK_RecipThroughput, I); 6689 } 6690 return getWideningCost(I, VF); 6691 } 6692 6693 LoopVectorizationCostModel::VectorizationCostTy 6694 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6695 ElementCount VF) { 6696 assert(!VF.isScalable() && 6697 "the cost model is not yet implemented for scalable vectorization"); 6698 // If we know that this instruction will remain uniform, check the cost of 6699 // the scalar version. 6700 if (isUniformAfterVectorization(I, VF)) 6701 VF = ElementCount::getFixed(1); 6702 6703 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6704 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6705 6706 // Forced scalars do not have any scalarization overhead. 6707 auto ForcedScalar = ForcedScalars.find(VF); 6708 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6709 auto InstSet = ForcedScalar->second; 6710 if (InstSet.count(I)) 6711 return VectorizationCostTy( 6712 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6713 VF.getKnownMinValue()), 6714 false); 6715 } 6716 6717 Type *VectorTy; 6718 unsigned C = getInstructionCost(I, VF, VectorTy); 6719 6720 bool TypeNotScalarized = 6721 VF.isVector() && VectorTy->isVectorTy() && 6722 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6723 return VectorizationCostTy(C, TypeNotScalarized); 6724 } 6725 6726 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6727 ElementCount VF) { 6728 6729 assert(!VF.isScalable() && 6730 "cannot compute scalarization overhead for scalable vectorization"); 6731 if (VF.isScalar()) 6732 return 0; 6733 6734 unsigned Cost = 0; 6735 Type *RetTy = ToVectorTy(I->getType(), VF); 6736 if (!RetTy->isVoidTy() && 6737 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6738 Cost += TTI.getScalarizationOverhead( 6739 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 6740 true, false); 6741 6742 // Some targets keep addresses scalar. 6743 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6744 return Cost; 6745 6746 // Some targets support efficient element stores. 
6747 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6748 return Cost; 6749 6750 // Collect operands to consider. 6751 CallInst *CI = dyn_cast<CallInst>(I); 6752 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6753 6754 // Skip operands that do not require extraction/scalarization and do not incur 6755 // any overhead. 6756 return Cost + TTI.getOperandsScalarizationOverhead( 6757 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 6758 } 6759 6760 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6761 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6762 if (VF.isScalar()) 6763 return; 6764 NumPredStores = 0; 6765 for (BasicBlock *BB : TheLoop->blocks()) { 6766 // For each instruction in the old loop. 6767 for (Instruction &I : *BB) { 6768 Value *Ptr = getLoadStorePointerOperand(&I); 6769 if (!Ptr) 6770 continue; 6771 6772 // TODO: We should generate better code and update the cost model for 6773 // predicated uniform stores. Today they are treated as any other 6774 // predicated store (see added test cases in 6775 // invariant-store-vectorization.ll). 6776 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6777 NumPredStores++; 6778 6779 if (Legal->isUniformMemOp(I)) { 6780 // TODO: Avoid replicating loads and stores instead of 6781 // relying on instcombine to remove them. 6782 // Load: Scalar load + broadcast 6783 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6784 unsigned Cost = getUniformMemOpCost(&I, VF); 6785 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6786 continue; 6787 } 6788 6789 // We assume that widening is the best solution when possible. 6790 if (memoryInstructionCanBeWidened(&I, VF)) { 6791 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6792 int ConsecutiveStride = 6793 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6794 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6795 "Expected consecutive stride."); 6796 InstWidening Decision = 6797 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6798 setWideningDecision(&I, VF, Decision, Cost); 6799 continue; 6800 } 6801 6802 // Choose between Interleaving, Gather/Scatter or Scalarization. 6803 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6804 unsigned NumAccesses = 1; 6805 if (isAccessInterleaved(&I)) { 6806 auto Group = getInterleavedAccessGroup(&I); 6807 assert(Group && "Fail to get an interleaved access group."); 6808 6809 // Make one decision for the whole group. 6810 if (getWideningDecision(&I, VF) != CM_Unknown) 6811 continue; 6812 6813 NumAccesses = Group->getNumMembers(); 6814 if (interleavedAccessCanBeWidened(&I, VF)) 6815 InterleaveCost = getInterleaveGroupCost(&I, VF); 6816 } 6817 6818 unsigned GatherScatterCost = 6819 isLegalGatherOrScatter(&I) 6820 ? getGatherScatterCost(&I, VF) * NumAccesses 6821 : std::numeric_limits<unsigned>::max(); 6822 6823 unsigned ScalarizationCost = 6824 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6825 6826 // Choose better solution for the current VF, 6827 // write down this decision and use it during vectorization. 
6828 unsigned Cost;
6829 InstWidening Decision;
6830 if (InterleaveCost <= GatherScatterCost &&
6831 InterleaveCost < ScalarizationCost) {
6832 Decision = CM_Interleave;
6833 Cost = InterleaveCost;
6834 } else if (GatherScatterCost < ScalarizationCost) {
6835 Decision = CM_GatherScatter;
6836 Cost = GatherScatterCost;
6837 } else {
6838 Decision = CM_Scalarize;
6839 Cost = ScalarizationCost;
6840 }
6841 // If the instruction belongs to an interleave group, the whole group
6842 // receives the same decision. The whole group receives the cost, but
6843 // the cost will actually be assigned to one instruction.
6844 if (auto Group = getInterleavedAccessGroup(&I))
6845 setWideningDecision(Group, VF, Decision, Cost);
6846 else
6847 setWideningDecision(&I, VF, Decision, Cost);
6848 }
6849 }
6850
6851 // Make sure that any load of address and any other address computation
6852 // remains scalar unless there is gather/scatter support. This avoids
6853 // inevitable extracts into address registers, and also has the benefit of
6854 // activating LSR more, since that pass can't optimize vectorized
6855 // addresses.
6856 if (TTI.prefersVectorizedAddressing())
6857 return;
6858
6859 // Start with all scalar pointer uses.
6860 SmallPtrSet<Instruction *, 8> AddrDefs;
6861 for (BasicBlock *BB : TheLoop->blocks())
6862 for (Instruction &I : *BB) {
6863 Instruction *PtrDef =
6864 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6865 if (PtrDef && TheLoop->contains(PtrDef) &&
6866 getWideningDecision(&I, VF) != CM_GatherScatter)
6867 AddrDefs.insert(PtrDef);
6868 }
6869
6870 // Add all instructions used to generate the addresses.
6871 SmallVector<Instruction *, 4> Worklist;
6872 for (auto *I : AddrDefs)
6873 Worklist.push_back(I);
6874 while (!Worklist.empty()) {
6875 Instruction *I = Worklist.pop_back_val();
6876 for (auto &Op : I->operands())
6877 if (auto *InstOp = dyn_cast<Instruction>(Op))
6878 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6879 AddrDefs.insert(InstOp).second)
6880 Worklist.push_back(InstOp);
6881 }
6882
6883 for (auto *I : AddrDefs) {
6884 if (isa<LoadInst>(I)) {
6885 // Setting the desired widening decision should ideally be handled
6886 // by cost functions, but since this involves the task of finding out
6887 // if the loaded register is involved in an address computation, it is
6888 // instead changed here when we know this is the case.
6889 InstWidening Decision = getWideningDecision(I, VF);
6890 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6891 // Scalarize a widened load of address.
6892 setWideningDecision(
6893 I, VF, CM_Scalarize,
6894 (VF.getKnownMinValue() *
6895 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6896 else if (auto Group = getInterleavedAccessGroup(I)) {
6897 // Scalarize an interleave group of address loads.
6898 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6899 if (Instruction *Member = Group->getMember(I))
6900 setWideningDecision(
6901 Member, VF, CM_Scalarize,
6902 (VF.getKnownMinValue() *
6903 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6904 }
6905 }
6906 } else
6907 // Make sure I gets scalarized and a cost estimate without
6908 // scalarization overhead.
6909 ForcedScalars[VF].insert(I); 6910 } 6911 } 6912 6913 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6914 ElementCount VF, 6915 Type *&VectorTy) { 6916 Type *RetTy = I->getType(); 6917 if (canTruncateToMinimalBitwidth(I, VF)) 6918 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6919 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6920 auto SE = PSE.getSE(); 6921 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6922 6923 // TODO: We need to estimate the cost of intrinsic calls. 6924 switch (I->getOpcode()) { 6925 case Instruction::GetElementPtr: 6926 // We mark this instruction as zero-cost because the cost of GEPs in 6927 // vectorized code depends on whether the corresponding memory instruction 6928 // is scalarized or not. Therefore, we handle GEPs with the memory 6929 // instruction cost. 6930 return 0; 6931 case Instruction::Br: { 6932 // In cases of scalarized and predicated instructions, there will be VF 6933 // predicated blocks in the vectorized loop. Each branch around these 6934 // blocks requires also an extract of its vector compare i1 element. 6935 bool ScalarPredicatedBB = false; 6936 BranchInst *BI = cast<BranchInst>(I); 6937 if (VF.isVector() && BI->isConditional() && 6938 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6939 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6940 ScalarPredicatedBB = true; 6941 6942 if (ScalarPredicatedBB) { 6943 // Return cost for branches around scalarized and predicated blocks. 6944 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6945 auto *Vec_i1Ty = 6946 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6947 return (TTI.getScalarizationOverhead( 6948 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6949 false, true) + 6950 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 6951 VF.getKnownMinValue())); 6952 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6953 // The back-edge branch will remain, as will all scalar branches. 6954 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6955 else 6956 // This branch will be eliminated by if-conversion. 6957 return 0; 6958 // Note: We currently assume zero cost for an unconditional branch inside 6959 // a predicated block since it will become a fall-through, although we 6960 // may decide in the future to call TTI for all branches. 6961 } 6962 case Instruction::PHI: { 6963 auto *Phi = cast<PHINode>(I); 6964 6965 // First-order recurrences are replaced by vector shuffles inside the loop. 6966 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6967 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 6968 return TTI.getShuffleCost( 6969 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 6970 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 6971 6972 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6973 // converted into select instructions. We require N - 1 selects per phi 6974 // node, where N is the number of incoming values. 
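// For example (illustrative IR, not taken from a real test case), a phi such
// as
//   %p = phi i32 [ %a, %bb1 ], [ %b, %bb2 ], [ %c, %bb3 ]
// in a non-header block is lowered after if-conversion to a chain of
// N - 1 = 2 vector selects,
//   %s1 = select <VF x i1> %m1, <VF x i32> %a.vec, <VF x i32> %b.vec
//   %p  = select <VF x i1> %m2, <VF x i32> %s1,    <VF x i32> %c.vec
// which is what the cost expression below models.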
6975 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 6976 return (Phi->getNumIncomingValues() - 1) * 6977 TTI.getCmpSelInstrCost( 6978 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6979 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6980 CmpInst::BAD_ICMP_PREDICATE, CostKind); 6981 6982 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6983 } 6984 case Instruction::UDiv: 6985 case Instruction::SDiv: 6986 case Instruction::URem: 6987 case Instruction::SRem: 6988 // If we have a predicated instruction, it may not be executed for each 6989 // vector lane. Get the scalarization cost and scale this amount by the 6990 // probability of executing the predicated block. If the instruction is not 6991 // predicated, we fall through to the next case. 6992 if (VF.isVector() && isScalarWithPredication(I)) { 6993 unsigned Cost = 0; 6994 6995 // These instructions have a non-void type, so account for the phi nodes 6996 // that we will create. This cost is likely to be zero. The phi node 6997 // cost, if any, should be scaled by the block probability because it 6998 // models a copy at the end of each predicated block. 6999 Cost += VF.getKnownMinValue() * 7000 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7001 7002 // The cost of the non-predicated instruction. 7003 Cost += VF.getKnownMinValue() * 7004 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7005 7006 // The cost of insertelement and extractelement instructions needed for 7007 // scalarization. 7008 Cost += getScalarizationOverhead(I, VF); 7009 7010 // Scale the cost by the probability of executing the predicated blocks. 7011 // This assumes the predicated block for each vector lane is equally 7012 // likely. 7013 return Cost / getReciprocalPredBlockProb(); 7014 } 7015 LLVM_FALLTHROUGH; 7016 case Instruction::Add: 7017 case Instruction::FAdd: 7018 case Instruction::Sub: 7019 case Instruction::FSub: 7020 case Instruction::Mul: 7021 case Instruction::FMul: 7022 case Instruction::FDiv: 7023 case Instruction::FRem: 7024 case Instruction::Shl: 7025 case Instruction::LShr: 7026 case Instruction::AShr: 7027 case Instruction::And: 7028 case Instruction::Or: 7029 case Instruction::Xor: { 7030 // Since we will replace the stride by 1 the multiplication should go away. 7031 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7032 return 0; 7033 // Certain instructions can be cheaper to vectorize if they have a constant 7034 // second vector operand. One example of this are shifts on x86. 7035 Value *Op2 = I->getOperand(1); 7036 TargetTransformInfo::OperandValueProperties Op2VP; 7037 TargetTransformInfo::OperandValueKind Op2VK = 7038 TTI.getOperandInfo(Op2, Op2VP); 7039 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7040 Op2VK = TargetTransformInfo::OK_UniformValue; 7041 7042 SmallVector<const Value *, 4> Operands(I->operand_values()); 7043 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7044 return N * TTI.getArithmeticInstrCost( 7045 I->getOpcode(), VectorTy, CostKind, 7046 TargetTransformInfo::OK_AnyValue, 7047 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7048 } 7049 case Instruction::FNeg: { 7050 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7051 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7052 return N * TTI.getArithmeticInstrCost( 7053 I->getOpcode(), VectorTy, CostKind, 7054 TargetTransformInfo::OK_AnyValue, 7055 TargetTransformInfo::OK_AnyValue, 7056 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7057 I->getOperand(0), I); 7058 } 7059 case Instruction::Select: { 7060 SelectInst *SI = cast<SelectInst>(I); 7061 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7062 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7063 Type *CondTy = SI->getCondition()->getType(); 7064 if (!ScalarCond) { 7065 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7066 CondTy = VectorType::get(CondTy, VF); 7067 } 7068 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7069 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7070 } 7071 case Instruction::ICmp: 7072 case Instruction::FCmp: { 7073 Type *ValTy = I->getOperand(0)->getType(); 7074 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7075 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7076 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7077 VectorTy = ToVectorTy(ValTy, VF); 7078 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7079 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7080 } 7081 case Instruction::Store: 7082 case Instruction::Load: { 7083 ElementCount Width = VF; 7084 if (Width.isVector()) { 7085 InstWidening Decision = getWideningDecision(I, Width); 7086 assert(Decision != CM_Unknown && 7087 "CM decision should be taken at this point"); 7088 if (Decision == CM_Scalarize) 7089 Width = ElementCount::getFixed(1); 7090 } 7091 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7092 return getMemoryInstructionCost(I, VF); 7093 } 7094 case Instruction::ZExt: 7095 case Instruction::SExt: 7096 case Instruction::FPToUI: 7097 case Instruction::FPToSI: 7098 case Instruction::FPExt: 7099 case Instruction::PtrToInt: 7100 case Instruction::IntToPtr: 7101 case Instruction::SIToFP: 7102 case Instruction::UIToFP: 7103 case Instruction::Trunc: 7104 case Instruction::FPTrunc: 7105 case Instruction::BitCast: { 7106 // Computes the CastContextHint from a Load/Store instruction. 7107 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7108 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7109 "Expected a load or a store!"); 7110 7111 if (VF.isScalar() || !TheLoop->contains(I)) 7112 return TTI::CastContextHint::Normal; 7113 7114 switch (getWideningDecision(I, VF)) { 7115 case LoopVectorizationCostModel::CM_GatherScatter: 7116 return TTI::CastContextHint::GatherScatter; 7117 case LoopVectorizationCostModel::CM_Interleave: 7118 return TTI::CastContextHint::Interleave; 7119 case LoopVectorizationCostModel::CM_Scalarize: 7120 case LoopVectorizationCostModel::CM_Widen: 7121 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7122 : TTI::CastContextHint::Normal; 7123 case LoopVectorizationCostModel::CM_Widen_Reverse: 7124 return TTI::CastContextHint::Reversed; 7125 case LoopVectorizationCostModel::CM_Unknown: 7126 llvm_unreachable("Instr did not go through cost modelling?"); 7127 } 7128 7129 llvm_unreachable("Unhandled case!"); 7130 }; 7131 7132 unsigned Opcode = I->getOpcode(); 7133 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7134 // For Trunc, the context is the only user, which must be a StoreInst. 
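// For example (illustrative only): in
//   %t = trunc i32 %x to i16
//   store i16 %t, i16* %p
// the hint is taken from the widening decision made for the store, so a
// reversed store yields CastContextHint::Reversed, a masked store yields
// CastContextHint::Masked, and so on, as computed by ComputeCCH above.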
7135 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7136 if (I->hasOneUse()) 7137 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7138 CCH = ComputeCCH(Store); 7139 } 7140 // For Z/Sext, the context is the operand, which must be a LoadInst. 7141 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7142 Opcode == Instruction::FPExt) { 7143 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7144 CCH = ComputeCCH(Load); 7145 } 7146 7147 // We optimize the truncation of induction variables having constant 7148 // integer steps. The cost of these truncations is the same as the scalar 7149 // operation. 7150 if (isOptimizableIVTruncate(I, VF)) { 7151 auto *Trunc = cast<TruncInst>(I); 7152 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7153 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7154 } 7155 7156 Type *SrcScalarTy = I->getOperand(0)->getType(); 7157 Type *SrcVecTy = 7158 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7159 if (canTruncateToMinimalBitwidth(I, VF)) { 7160 // This cast is going to be shrunk. This may remove the cast or it might 7161 // turn it into slightly different cast. For example, if MinBW == 16, 7162 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7163 // 7164 // Calculate the modified src and dest types. 7165 Type *MinVecTy = VectorTy; 7166 if (Opcode == Instruction::Trunc) { 7167 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7168 VectorTy = 7169 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7170 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7171 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7172 VectorTy = 7173 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7174 } 7175 } 7176 7177 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7178 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7179 return N * 7180 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7181 } 7182 case Instruction::Call: { 7183 bool NeedToScalarize; 7184 CallInst *CI = cast<CallInst>(I); 7185 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7186 if (getVectorIntrinsicIDForCall(CI, TLI)) 7187 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 7188 return CallCost; 7189 } 7190 case Instruction::ExtractValue: 7191 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7192 default: 7193 // The cost of executing VF copies of the scalar instruction. This opcode 7194 // is unknown. Assume that it is the same as 'mul'. 7195 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7196 Instruction::Mul, VectorTy, CostKind) + 7197 getScalarizationOverhead(I, VF); 7198 } // end of switch. 
7199 } 7200 7201 char LoopVectorize::ID = 0; 7202 7203 static const char lv_name[] = "Loop Vectorization"; 7204 7205 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7206 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7207 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7208 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7209 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7210 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7211 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7212 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7213 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7214 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7215 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7216 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7217 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7218 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7219 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7220 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7221 7222 namespace llvm { 7223 7224 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7225 7226 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7227 bool VectorizeOnlyWhenForced) { 7228 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7229 } 7230 7231 } // end namespace llvm 7232 7233 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7234 // Check if the pointer operand of a load or store instruction is 7235 // consecutive. 7236 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7237 return Legal->isConsecutivePtr(Ptr); 7238 return false; 7239 } 7240 7241 void LoopVectorizationCostModel::collectValuesToIgnore() { 7242 // Ignore ephemeral values. 7243 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7244 7245 // Ignore type-promoting instructions we identified during reduction 7246 // detection. 7247 for (auto &Reduction : Legal->getReductionVars()) { 7248 RecurrenceDescriptor &RedDes = Reduction.second; 7249 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7250 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7251 } 7252 // Ignore type-casting instructions we identified during induction 7253 // detection. 7254 for (auto &Induction : Legal->getInductionVars()) { 7255 InductionDescriptor &IndDes = Induction.second; 7256 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7257 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7258 } 7259 } 7260 7261 void LoopVectorizationCostModel::collectInLoopReductions() { 7262 for (auto &Reduction : Legal->getReductionVars()) { 7263 PHINode *Phi = Reduction.first; 7264 RecurrenceDescriptor &RdxDesc = Reduction.second; 7265 7266 // We don't collect reductions that are type promoted (yet). 7267 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7268 continue; 7269 7270 // If the target would prefer this reduction to happen "in-loop", then we 7271 // want to record it as such. 7272 unsigned Opcode = RdxDesc.getRecurrenceBinOp(); 7273 if (!PreferInLoopReductions && 7274 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7275 TargetTransformInfo::ReductionFlags())) 7276 continue; 7277 7278 // Check that we can correctly put the reductions into the loop, by 7279 // finding the chain of operations that leads from the phi to the loop 7280 // exit value. 
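// For example (illustrative IR), for an integer add reduction
//   %sum      = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
//   %sum.next = add i32 %sum, %val
// the chain { %sum.next } links the phi to the value leaving the loop; if no
// such chain can be found, the reduction is kept out-of-loop below.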
7281 SmallVector<Instruction *, 4> ReductionOperations =
7282 RdxDesc.getReductionOpChain(Phi, TheLoop);
7283 bool InLoop = !ReductionOperations.empty();
7284 if (InLoop)
7285 InLoopReductionChains[Phi] = ReductionOperations;
7286 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7287 << " reduction for phi: " << *Phi << "\n");
7288 }
7289 }
7290
7291 // TODO: we could return a pair of values that specify the max VF and
7292 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7293 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7294 // doesn't have a cost model that can choose which plan to execute if
7295 // more than one is generated.
7296 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7297 LoopVectorizationCostModel &CM) {
7298 unsigned WidestType;
7299 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7300 return WidestVectorRegBits / WidestType;
7301 }
7302
7303 VectorizationFactor
7304 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7305 assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7306 ElementCount VF = UserVF;
7307 // Outer loop handling: they may require CFG and instruction level
7308 // transformations before even evaluating whether vectorization is profitable.
7309 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7310 // the vectorization pipeline.
7311 if (!OrigLoop->isInnermost()) {
7312 // If the user doesn't provide a vectorization factor, determine a
7313 // reasonable one.
7314 if (UserVF.isZero()) {
7315 VF = ElementCount::getFixed(
7316 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7317 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7318
7319 // Make sure we have a VF > 1 for stress testing.
7320 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7321 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7322 << "overriding computed VF.\n");
7323 VF = ElementCount::getFixed(4);
7324 }
7325 }
7326 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7327 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7328 "VF needs to be a power of two");
7329 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7330 << "VF " << VF << " to build VPlans.\n");
7331 buildVPlans(VF, VF);
7332
7333 // For VPlan build stress testing, we bail out after VPlan construction.
7334 if (VPlanBuildStressTest)
7335 return VectorizationFactor::Disabled();
7336
7337 return {VF, 0 /*Cost*/};
7338 }
7339
7340 LLVM_DEBUG(
7341 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7342 "VPlan-native path.\n");
7343 return VectorizationFactor::Disabled();
7344 }
7345
7346 Optional<VectorizationFactor>
7347 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7348 assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
7349 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7350 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7351 if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
7352 return None;
7353
7354 // Invalidate interleave groups if all blocks of loop will be predicated.
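// For example (illustrative only): a group loading A[2*i] and A[2*i+1] would
// have to be emitted as a single masked wide load once the tail is folded by
// masking; without target support for masked interleaved accesses the group,
// and every decision derived from it, has to be dropped.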
7355 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7356 !useMaskedInterleavedAccesses(*TTI)) { 7357 LLVM_DEBUG( 7358 dbgs() 7359 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7360 "which requires masked-interleaved support.\n"); 7361 if (CM.InterleaveInfo.invalidateGroups()) 7362 // Invalidating interleave groups also requires invalidating all decisions 7363 // based on them, which includes widening decisions and uniform and scalar 7364 // values. 7365 CM.invalidateCostModelingDecisions(); 7366 } 7367 7368 ElementCount MaxVF = MaybeMaxVF.getValue(); 7369 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7370 7371 if (!UserVF.isZero() && UserVF.getFixedValue() <= MaxVF.getFixedValue()) { 7372 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7373 assert(isPowerOf2_32(UserVF.getFixedValue()) && 7374 "VF needs to be a power of two"); 7375 // Collect the instructions (and their associated costs) that will be more 7376 // profitable to scalarize. 7377 CM.selectUserVectorizationFactor(UserVF); 7378 CM.collectInLoopReductions(); 7379 buildVPlansWithVPRecipes(UserVF, UserVF); 7380 LLVM_DEBUG(printPlans(dbgs())); 7381 return {{UserVF, 0}}; 7382 } 7383 7384 for (ElementCount VF = ElementCount::getFixed(1); 7385 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7386 // Collect Uniform and Scalar instructions after vectorization with VF. 7387 CM.collectUniformsAndScalars(VF); 7388 7389 // Collect the instructions (and their associated costs) that will be more 7390 // profitable to scalarize. 7391 if (VF.isVector()) 7392 CM.collectInstsToScalarize(VF); 7393 } 7394 7395 CM.collectInLoopReductions(); 7396 7397 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7398 LLVM_DEBUG(printPlans(dbgs())); 7399 if (MaxVF.isScalar()) 7400 return VectorizationFactor::Disabled(); 7401 7402 // Select the optimal vectorization factor. 7403 return CM.selectVectorizationFactor(MaxVF); 7404 } 7405 7406 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7407 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7408 << '\n'); 7409 BestVF = VF; 7410 BestUF = UF; 7411 7412 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7413 return !Plan->hasVF(VF); 7414 }); 7415 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7416 } 7417 7418 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7419 DominatorTree *DT) { 7420 // Perform the actual loop transformation. 7421 7422 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7423 VPCallbackILV CallbackILV(ILV); 7424 7425 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7426 7427 VPTransformState State{*BestVF, BestUF, LI, 7428 DT, ILV.Builder, ILV.VectorLoopValueMap, 7429 &ILV, CallbackILV}; 7430 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7431 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7432 State.CanonicalIV = ILV.Induction; 7433 7434 ILV.printDebugTracesAtStart(); 7435 7436 //===------------------------------------------------===// 7437 // 7438 // Notice: any optimization or new instruction that go 7439 // into the code below should also be implemented in 7440 // the cost-model. 7441 // 7442 //===------------------------------------------------===// 7443 7444 // 2. Copy and widen instructions from the old loop into the new loop. 7445 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7446 VPlans.front()->execute(&State); 7447 7448 // 3. 
Fix the vectorized code: take care of header phis, live-outs,
7449 // predication, updating analyses.
7450 ILV.fixVectorizedLoop();
7451
7452 ILV.printDebugTracesAtEnd();
7453 }
7454
7455 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7456 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7457 BasicBlock *Latch = OrigLoop->getLoopLatch();
7458
7459 // We create new control-flow for the vectorized loop, so the original
7460 // condition will be dead after vectorization if it's only used by the
7461 // branch.
7462 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7463 if (Cmp && Cmp->hasOneUse()) {
7464 DeadInstructions.insert(Cmp);
7465
7466 // The operands of the icmp often include a dead trunc, used by IndUpdate.
7467 for (Value *Op : Cmp->operands()) {
7468 if (isa<TruncInst>(Op) && Op->hasOneUse())
7469 DeadInstructions.insert(cast<Instruction>(Op));
7470 }
7471 }
7472
7473 // We create new "steps" for induction variable updates to which the original
7474 // induction variables map. An original update instruction will be dead if
7475 // all its users except the induction variable are dead.
7476 for (auto &Induction : Legal->getInductionVars()) {
7477 PHINode *Ind = Induction.first;
7478 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7479
7480 // If the tail is to be folded by masking, the primary induction variable,
7481 // if it exists, isn't dead: it will be used for masking. Don't kill it.
7482 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7483 continue;
7484
7485 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7486 return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7487 }))
7488 DeadInstructions.insert(IndUpdate);
7489
7490 // We also record as "Dead" the type-casting instructions we had identified
7491 // during induction analysis. We don't need any handling for them in the
7492 // vectorized loop because we have proven that, under a proper runtime
7493 // test guarding the vectorized loop, the value of the phi, and the casted
7494 // value of the phi, are the same. The last instruction in this casting chain
7495 // will get its scalar/vector/widened def from the scalar/vector/widened def
7496 // of the respective phi node. Any other casts in the induction def-use chain
7497 // have no other uses outside the phi update chain, and will be ignored.
7498 InductionDescriptor &IndDes = Induction.second;
7499 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7500 DeadInstructions.insert(Casts.begin(), Casts.end());
7501 }
7502 }
7503
7504 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7505
7506 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7507
7508 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7509 Instruction::BinaryOps BinOp) {
7510 // When unrolling and the VF is 1, we only need to add a simple scalar.
7511 Type *Ty = Val->getType();
7512 assert(!Ty->isVectorTy() && "Val must be a scalar");
7513
7514 if (Ty->isFloatingPointTy()) {
7515 Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7516
7517 // Floating point operations had to be 'fast' to enable the unrolling.
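// Illustration (added for exposition): with BinOp = fadd, Val = %v,
// Step = %s and StartIdx = 2, the two lines below produce %v + 2.0 * %s for
// this unrolled part, relying on the fast-math flags mentioned above; the
// integer path further down computes the analogous %v + 2 * %s.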
7518 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7519 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7520 }
7521 Constant *C = ConstantInt::get(Ty, StartIdx);
7522 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7523 }
7524
7525 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7526 SmallVector<Metadata *, 4> MDs;
7527 // Reserve first location for self reference to the LoopID metadata node.
7528 MDs.push_back(nullptr);
7529 bool IsUnrollMetadata = false;
7530 MDNode *LoopID = L->getLoopID();
7531 if (LoopID) {
7532 // First find existing loop unrolling disable metadata.
7533 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7534 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7535 if (MD) {
7536 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7537 IsUnrollMetadata =
7538 S && S->getString().startswith("llvm.loop.unroll.disable");
7539 }
7540 MDs.push_back(LoopID->getOperand(i));
7541 }
7542 }
7543
7544 if (!IsUnrollMetadata) {
7545 // Add runtime unroll disable metadata.
7546 LLVMContext &Context = L->getHeader()->getContext();
7547 SmallVector<Metadata *, 1> DisableOperands;
7548 DisableOperands.push_back(
7549 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7550 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7551 MDs.push_back(DisableNode);
7552 MDNode *NewLoopID = MDNode::get(Context, MDs);
7553 // Set operand 0 to refer to the loop id itself.
7554 NewLoopID->replaceOperandWith(0, NewLoopID);
7555 L->setLoopID(NewLoopID);
7556 }
7557 }
7558
7559 //===--------------------------------------------------------------------===//
7560 // EpilogueVectorizerMainLoop
7561 //===--------------------------------------------------------------------===//
7562
7563 /// This function is partially responsible for generating the control flow
7564 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7565 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7566 MDNode *OrigLoopID = OrigLoop->getLoopID();
7567 Loop *Lp = createVectorLoopSkeleton("");
7568
7569 // Generate the code to check the minimum iteration count of the vector
7570 // epilogue (see below).
7571 EPI.EpilogueIterationCountCheck =
7572 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7573 EPI.EpilogueIterationCountCheck->setName("iter.check");
7574
7575 // Generate the code to check any assumptions that we've made for SCEV
7576 // expressions.
7577 BasicBlock *SavedPreHeader = LoopVectorPreHeader;
7578 emitSCEVChecks(Lp, LoopScalarPreHeader);
7579
7580 // If a safety check was generated, save it.
7581 if (SavedPreHeader != LoopVectorPreHeader)
7582 EPI.SCEVSafetyCheck = SavedPreHeader;
7583
7584 // Generate the code that checks at runtime if arrays overlap. We put the
7585 // checks into a separate block to make the more common case of few elements
7586 // faster.
7587 SavedPreHeader = LoopVectorPreHeader;
7588 emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7589
7590 // If a safety check was generated, save/overwrite it.
7591 if (SavedPreHeader != LoopVectorPreHeader)
7592 EPI.MemSafetyCheck = SavedPreHeader;
7593
7594 // Generate the iteration count check for the main loop, *after* the check
7595 // for the epilogue loop, so that the path-length is shorter for the case
7596 // that goes directly through the vector epilogue. The longer-path length for
7597 // the main loop is compensated for by the gain from vectorizing the larger
7598 // trip count.
Note: the branch will get updated later on when we vectorize 7599 // the epilogue. 7600 EPI.MainLoopIterationCountCheck = 7601 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7602 7603 // Generate the induction variable. 7604 OldInduction = Legal->getPrimaryInduction(); 7605 Type *IdxTy = Legal->getWidestInductionType(); 7606 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7607 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7608 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7609 EPI.VectorTripCount = CountRoundDown; 7610 Induction = 7611 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7612 getDebugLocFromInstOrOperands(OldInduction)); 7613 7614 // Skip induction resume value creation here because they will be created in 7615 // the second pass. If we created them here, they wouldn't be used anyway, 7616 // because the vplan in the second pass still contains the inductions from the 7617 // original loop. 7618 7619 return completeLoopSkeleton(Lp, OrigLoopID); 7620 } 7621 7622 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7623 LLVM_DEBUG({ 7624 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7625 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7626 << ", Main Loop UF:" << EPI.MainLoopUF 7627 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7628 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7629 }); 7630 } 7631 7632 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7633 DEBUG_WITH_TYPE(VerboseDebug, { 7634 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 7635 }); 7636 } 7637 7638 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 7639 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 7640 assert(L && "Expected valid Loop."); 7641 assert(Bypass && "Expected valid bypass basic block."); 7642 unsigned VFactor = 7643 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 7644 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7645 Value *Count = getOrCreateTripCount(L); 7646 // Reuse existing vector loop preheader for TC checks. 7647 // Note that new preheader block is generated for vector loop. 7648 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7649 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7650 7651 // Generate code to check if the loop's trip count is less than VF * UF of the 7652 // main vector loop. 7653 auto P = 7654 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7655 7656 Value *CheckMinIters = Builder.CreateICmp( 7657 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 7658 "min.iters.check"); 7659 7660 if (!ForEpilogue) 7661 TCCheckBlock->setName("vector.main.loop.iter.check"); 7662 7663 // Create new preheader for vector loop. 7664 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7665 DT, LI, nullptr, "vector.ph"); 7666 7667 if (ForEpilogue) { 7668 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7669 DT->getNode(Bypass)->getIDom()) && 7670 "TC check is expected to dominate Bypass"); 7671 7672 // Update dominator for Bypass & LoopExit. 7673 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7674 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7675 7676 LoopBypassBlocks.push_back(TCCheckBlock); 7677 7678 // Save the trip count so we don't have to regenerate it in the 7679 // vec.epilog.iter.check. 
This is safe to do because the trip count 7680 // generated here dominates the vector epilog iter check. 7681 EPI.TripCount = Count; 7682 } 7683 7684 ReplaceInstWithInst( 7685 TCCheckBlock->getTerminator(), 7686 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7687 7688 return TCCheckBlock; 7689 } 7690 7691 //===--------------------------------------------------------------------===// 7692 // EpilogueVectorizerEpilogueLoop 7693 //===--------------------------------------------------------------------===// 7694 7695 /// This function is partially responsible for generating the control flow 7696 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7697 BasicBlock * 7698 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7699 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7700 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 7701 7702 // Now, compare the remaining count and if there aren't enough iterations to 7703 // execute the vectorized epilogue skip to the scalar part. 7704 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7705 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7706 LoopVectorPreHeader = 7707 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7708 LI, nullptr, "vec.epilog.ph"); 7709 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 7710 VecEpilogueIterationCountCheck); 7711 7712 // Adjust the control flow taking the state info from the main loop 7713 // vectorization into account. 7714 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7715 "expected this to be saved from the previous pass."); 7716 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7717 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7718 7719 DT->changeImmediateDominator(LoopVectorPreHeader, 7720 EPI.MainLoopIterationCountCheck); 7721 7722 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7723 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7724 7725 if (EPI.SCEVSafetyCheck) 7726 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7727 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7728 if (EPI.MemSafetyCheck) 7729 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7730 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7731 7732 DT->changeImmediateDominator( 7733 VecEpilogueIterationCountCheck, 7734 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7735 7736 DT->changeImmediateDominator(LoopScalarPreHeader, 7737 EPI.EpilogueIterationCountCheck); 7738 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 7739 7740 // Keep track of bypass blocks, as they feed start values to the induction 7741 // phis in the scalar loop preheader. 
7742 if (EPI.SCEVSafetyCheck)
7743 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7744 if (EPI.MemSafetyCheck)
7745 LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7746 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7747
7748 // Generate a resume induction for the vector epilogue and put it in the
7749 // vector epilogue preheader.
7750 Type *IdxTy = Legal->getWidestInductionType();
7751 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7752 LoopVectorPreHeader->getFirstNonPHI());
7753 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7754 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7755 EPI.MainLoopIterationCountCheck);
7756
7757 // Generate the induction variable.
7758 OldInduction = Legal->getPrimaryInduction();
7759 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7760 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7761 Value *StartIdx = EPResumeVal;
7762 Induction =
7763 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7764 getDebugLocFromInstOrOperands(OldInduction));
7765
7766 // Generate induction resume values. These variables save the new starting
7767 // indexes for the scalar loop. They are used to test if there are any tail
7768 // iterations left once the vector loop has completed.
7769 // Note that when the vectorized epilogue is skipped due to the iteration
7770 // count check, the resume value for the induction variable comes from
7771 // the trip count of the main vector loop, hence passing the AdditionalBypass
7772 // argument.
7773 createInductionResumeValues(Lp, CountRoundDown,
7774 {VecEpilogueIterationCountCheck,
7775 EPI.VectorTripCount} /* AdditionalBypass */);
7776
7777 AddRuntimeUnrollDisableMetaData(Lp);
7778 return completeLoopSkeleton(Lp, OrigLoopID);
7779 }
7780
7781 BasicBlock *
7782 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7783 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
7784
7785 assert(EPI.TripCount &&
7786 "Expected trip count to have been saved in the first pass.");
7787 assert(
7788 (!isa<Instruction>(EPI.TripCount) ||
7789 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7790 "saved trip count does not dominate insertion point.");
7791 Value *TC = EPI.TripCount;
7792 IRBuilder<> Builder(Insert->getTerminator());
7793 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7794
7795 // Generate code to check if the loop's trip count is less than VF * UF of the
7796 // vector epilogue loop.
7797 auto P =
7798 Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7799 7800 Value *CheckMinIters = Builder.CreateICmp( 7801 P, Count, 7802 ConstantInt::get(Count->getType(), 7803 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 7804 "min.epilog.iters.check"); 7805 7806 ReplaceInstWithInst( 7807 Insert->getTerminator(), 7808 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7809 7810 LoopBypassBlocks.push_back(Insert); 7811 return Insert; 7812 } 7813 7814 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7815 LLVM_DEBUG({ 7816 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7817 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7818 << ", Main Loop UF:" << EPI.MainLoopUF 7819 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7820 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7821 }); 7822 } 7823 7824 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7825 DEBUG_WITH_TYPE(VerboseDebug, { 7826 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 7827 }); 7828 } 7829 7830 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7831 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7832 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7833 bool PredicateAtRangeStart = Predicate(Range.Start); 7834 7835 for (ElementCount TmpVF = Range.Start * 2; 7836 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 7837 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7838 Range.End = TmpVF; 7839 break; 7840 } 7841 7842 return PredicateAtRangeStart; 7843 } 7844 7845 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7846 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7847 /// of VF's starting at a given VF and extending it as much as possible. Each 7848 /// vectorization decision can potentially shorten this sub-range during 7849 /// buildVPlan(). 7850 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7851 ElementCount MaxVF) { 7852 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 7853 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 7854 VFRange SubRange = {VF, MaxVFPlusOne}; 7855 VPlans.push_back(buildVPlan(SubRange)); 7856 VF = SubRange.End; 7857 } 7858 } 7859 7860 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7861 VPlanPtr &Plan) { 7862 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7863 7864 // Look for cached value. 7865 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7866 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7867 if (ECEntryIt != EdgeMaskCache.end()) 7868 return ECEntryIt->second; 7869 7870 VPValue *SrcMask = createBlockInMask(Src, Plan); 7871 7872 // The terminator has to be a branch inst! 7873 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7874 assert(BI && "Unexpected terminator found"); 7875 7876 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7877 return EdgeMaskCache[Edge] = SrcMask; 7878 7879 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 7880 assert(EdgeMask && "No Edge Mask found for condition"); 7881 7882 if (BI->getSuccessor(0) != Dst) 7883 EdgeMask = Builder.createNot(EdgeMask); 7884 7885 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 
7886 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 7887 7888 return EdgeMaskCache[Edge] = EdgeMask; 7889 } 7890 7891 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7892 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7893 7894 // Look for cached value. 7895 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7896 if (BCEntryIt != BlockMaskCache.end()) 7897 return BCEntryIt->second; 7898 7899 // All-one mask is modelled as no-mask following the convention for masked 7900 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7901 VPValue *BlockMask = nullptr; 7902 7903 if (OrigLoop->getHeader() == BB) { 7904 if (!CM.blockNeedsPredication(BB)) 7905 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 7906 7907 // Create the block in mask as the first non-phi instruction in the block. 7908 VPBuilder::InsertPointGuard Guard(Builder); 7909 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 7910 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 7911 7912 // Introduce the early-exit compare IV <= BTC to form header block mask. 7913 // This is used instead of IV < TC because TC may wrap, unlike BTC. 7914 // Start by constructing the desired canonical IV. 7915 VPValue *IV = nullptr; 7916 if (Legal->getPrimaryInduction()) 7917 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 7918 else { 7919 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 7920 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 7921 IV = IVRecipe->getVPValue(); 7922 } 7923 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 7924 bool TailFolded = !CM.isScalarEpilogueAllowed(); 7925 7926 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 7927 // While ActiveLaneMask is a binary op that consumes the loop tripcount 7928 // as a second argument, we only pass the IV here and extract the 7929 // tripcount from the transform state where codegen of the VP instructions 7930 // happen. 7931 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 7932 } else { 7933 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 7934 } 7935 return BlockMaskCache[BB] = BlockMask; 7936 } 7937 7938 // This is the block mask. We OR all incoming edges. 7939 for (auto *Predecessor : predecessors(BB)) { 7940 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 7941 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 7942 return BlockMaskCache[BB] = EdgeMask; 7943 7944 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
7945 BlockMask = EdgeMask; 7946 continue; 7947 } 7948 7949 BlockMask = Builder.createOr(BlockMask, EdgeMask); 7950 } 7951 7952 return BlockMaskCache[BB] = BlockMask; 7953 } 7954 7955 VPWidenMemoryInstructionRecipe * 7956 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 7957 VPlanPtr &Plan) { 7958 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7959 "Must be called with either a load or store"); 7960 7961 auto willWiden = [&](ElementCount VF) -> bool { 7962 assert(!VF.isScalable() && "unexpected scalable ElementCount"); 7963 if (VF.isScalar()) 7964 return false; 7965 LoopVectorizationCostModel::InstWidening Decision = 7966 CM.getWideningDecision(I, VF); 7967 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 7968 "CM decision should be taken at this point."); 7969 if (Decision == LoopVectorizationCostModel::CM_Interleave) 7970 return true; 7971 if (CM.isScalarAfterVectorization(I, VF) || 7972 CM.isProfitableToScalarize(I, VF)) 7973 return false; 7974 return Decision != LoopVectorizationCostModel::CM_Scalarize; 7975 }; 7976 7977 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7978 return nullptr; 7979 7980 VPValue *Mask = nullptr; 7981 if (Legal->isMaskRequired(I)) 7982 Mask = createBlockInMask(I->getParent(), Plan); 7983 7984 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 7985 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 7986 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 7987 7988 StoreInst *Store = cast<StoreInst>(I); 7989 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 7990 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 7991 } 7992 7993 VPWidenIntOrFpInductionRecipe * 7994 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 7995 // Check if this is an integer or fp induction. If so, build the recipe that 7996 // produces its scalar and vector values. 7997 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 7998 if (II.getKind() == InductionDescriptor::IK_IntInduction || 7999 II.getKind() == InductionDescriptor::IK_FpInduction) 8000 return new VPWidenIntOrFpInductionRecipe(Phi); 8001 8002 return nullptr; 8003 } 8004 8005 VPWidenIntOrFpInductionRecipe * 8006 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 8007 VFRange &Range) const { 8008 // Optimize the special case where the source is a constant integer 8009 // induction variable. Notice that we can only optimize the 'trunc' case 8010 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8011 // (c) other casts depend on pointer size. 8012 8013 // Determine whether \p K is a truncation based on an induction variable that 8014 // can be optimized. 8015 auto isOptimizableIVTruncate = 8016 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8017 return [=](ElementCount VF) -> bool { 8018 return CM.isOptimizableIVTruncate(K, VF); 8019 }; 8020 }; 8021 8022 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8023 isOptimizableIVTruncate(I), Range)) 8024 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8025 I); 8026 return nullptr; 8027 } 8028 8029 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8030 // We know that all PHIs in non-header blocks are converted into selects, so 8031 // we don't have to worry about the insertion order and we can just use the 8032 // builder. At this point we generate the predication tree. 
There may be 8033 // duplications since this is a simple recursive scan, but future 8034 // optimizations will clean it up. 8035 8036 SmallVector<VPValue *, 2> Operands; 8037 unsigned NumIncoming = Phi->getNumIncomingValues(); 8038 for (unsigned In = 0; In < NumIncoming; In++) { 8039 VPValue *EdgeMask = 8040 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8041 assert((EdgeMask || NumIncoming == 1) && 8042 "Multiple predecessors with one having a full mask"); 8043 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8044 if (EdgeMask) 8045 Operands.push_back(EdgeMask); 8046 } 8047 return new VPBlendRecipe(Phi, Operands); 8048 } 8049 8050 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8051 VPlan &Plan) const { 8052 8053 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8054 [this, CI](ElementCount VF) { 8055 return CM.isScalarWithPredication(CI, VF); 8056 }, 8057 Range); 8058 8059 if (IsPredicated) 8060 return nullptr; 8061 8062 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8063 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8064 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8065 ID == Intrinsic::pseudoprobe)) 8066 return nullptr; 8067 8068 auto willWiden = [&](ElementCount VF) -> bool { 8069 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8070 // The following case may be scalarized depending on the VF. 8071 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8072 // version of the instruction. 8073 // Is it beneficial to perform intrinsic call compared to lib call? 8074 bool NeedToScalarize = false; 8075 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8076 bool UseVectorIntrinsic = 8077 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 8078 return UseVectorIntrinsic || !NeedToScalarize; 8079 }; 8080 8081 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8082 return nullptr; 8083 8084 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8085 } 8086 8087 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8088 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8089 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8090 // Instruction should be widened, unless it is scalar after vectorization, 8091 // scalarization is profitable or it is predicated. 
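// For illustration only (hypothetical VFs): if WillScalarize below is false
// for VF = 2 and VF = 4 but true for VF = 8, and Range is [2, 16), then
// getDecisionAndClampRange clamps Range to [2, 8) and this function returns
// true, i.e. the instruction is widened for the remaining sub-range.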
8092 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8093 return CM.isScalarAfterVectorization(I, VF) || 8094 CM.isProfitableToScalarize(I, VF) || 8095 CM.isScalarWithPredication(I, VF); 8096 }; 8097 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8098 Range); 8099 } 8100 8101 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8102 auto IsVectorizableOpcode = [](unsigned Opcode) { 8103 switch (Opcode) { 8104 case Instruction::Add: 8105 case Instruction::And: 8106 case Instruction::AShr: 8107 case Instruction::BitCast: 8108 case Instruction::FAdd: 8109 case Instruction::FCmp: 8110 case Instruction::FDiv: 8111 case Instruction::FMul: 8112 case Instruction::FNeg: 8113 case Instruction::FPExt: 8114 case Instruction::FPToSI: 8115 case Instruction::FPToUI: 8116 case Instruction::FPTrunc: 8117 case Instruction::FRem: 8118 case Instruction::FSub: 8119 case Instruction::ICmp: 8120 case Instruction::IntToPtr: 8121 case Instruction::LShr: 8122 case Instruction::Mul: 8123 case Instruction::Or: 8124 case Instruction::PtrToInt: 8125 case Instruction::SDiv: 8126 case Instruction::Select: 8127 case Instruction::SExt: 8128 case Instruction::Shl: 8129 case Instruction::SIToFP: 8130 case Instruction::SRem: 8131 case Instruction::Sub: 8132 case Instruction::Trunc: 8133 case Instruction::UDiv: 8134 case Instruction::UIToFP: 8135 case Instruction::URem: 8136 case Instruction::Xor: 8137 case Instruction::ZExt: 8138 return true; 8139 } 8140 return false; 8141 }; 8142 8143 if (!IsVectorizableOpcode(I->getOpcode())) 8144 return nullptr; 8145 8146 // Success: widen this instruction. 8147 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8148 } 8149 8150 VPBasicBlock *VPRecipeBuilder::handleReplication( 8151 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8152 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 8153 VPlanPtr &Plan) { 8154 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8155 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8156 Range); 8157 8158 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8159 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8160 Range); 8161 8162 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8163 IsUniform, IsPredicated); 8164 setRecipe(I, Recipe); 8165 Plan->addVPValue(I, Recipe); 8166 8167 // Find if I uses a predicated instruction. If so, it will use its scalar 8168 // value. Avoid hoisting the insert-element which packs the scalar value into 8169 // a vector value, as that happens iff all users use the vector value. 8170 for (auto &Op : I->operands()) 8171 if (auto *PredInst = dyn_cast<Instruction>(Op)) 8172 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 8173 PredInst2Recipe[PredInst]->setAlsoPack(false); 8174 8175 // Finalize the recipe for Instr, first if it is not predicated. 8176 if (!IsPredicated) { 8177 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8178 VPBB->appendRecipe(Recipe); 8179 return VPBB; 8180 } 8181 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8182 assert(VPBB->getSuccessors().empty() && 8183 "VPBB has successors when handling predicated replication."); 8184 // Record predicated instructions for above packing optimizations. 
8185 PredInst2Recipe[I] = Recipe; 8186 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8187 VPBlockUtils::insertBlockAfter(Region, VPBB); 8188 auto *RegSucc = new VPBasicBlock(); 8189 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8190 return RegSucc; 8191 } 8192 8193 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8194 VPRecipeBase *PredRecipe, 8195 VPlanPtr &Plan) { 8196 // Instructions marked for predication are replicated and placed under an 8197 // if-then construct to prevent side-effects. 8198 8199 // Generate recipes to compute the block mask for this region. 8200 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8201 8202 // Build the triangular if-then region. 8203 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8204 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8205 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8206 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8207 auto *PHIRecipe = Instr->getType()->isVoidTy() 8208 ? nullptr 8209 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8210 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8211 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8212 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8213 8214 // Note: first set Entry as region entry and then connect successors starting 8215 // from it in order, to propagate the "parent" of each VPBasicBlock. 8216 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8217 VPBlockUtils::connectBlocks(Pred, Exit); 8218 8219 return Region; 8220 } 8221 8222 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8223 VFRange &Range, 8224 VPlanPtr &Plan) { 8225 // First, check for specific widening recipes that deal with calls, memory 8226 // operations, inductions and Phi nodes. 8227 if (auto *CI = dyn_cast<CallInst>(Instr)) 8228 return tryToWidenCall(CI, Range, *Plan); 8229 8230 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8231 return tryToWidenMemory(Instr, Range, Plan); 8232 8233 VPRecipeBase *Recipe; 8234 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8235 if (Phi->getParent() != OrigLoop->getHeader()) 8236 return tryToBlend(Phi, Plan); 8237 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 8238 return Recipe; 8239 return new VPWidenPHIRecipe(Phi); 8240 } 8241 8242 if (isa<TruncInst>(Instr) && 8243 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 8244 return Recipe; 8245 8246 if (!shouldWiden(Instr, Range)) 8247 return nullptr; 8248 8249 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8250 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 8251 OrigLoop); 8252 8253 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8254 bool InvariantCond = 8255 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8256 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 8257 InvariantCond); 8258 } 8259 8260 return tryToWiden(Instr, *Plan); 8261 } 8262 8263 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8264 ElementCount MaxVF) { 8265 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8266 8267 // Collect instructions from the original loop that will become trivially dead 8268 // in the vectorized loop. We don't need to vectorize these instructions. 
8269 // For example, original induction update instructions can become dead because we
8270 // separately emit induction "steps" when generating code for the new loop.
8271 // Similarly, we create a new latch condition when setting up the structure
8272 // of the new loop, so the old one can become dead.
8273 SmallPtrSet<Instruction *, 4> DeadInstructions;
8274 collectTriviallyDeadInstructions(DeadInstructions);
8275
8276 // Add assume instructions we need to drop to DeadInstructions, to prevent
8277 // them from being added to the VPlan.
8278 // TODO: We only need to drop assumes in blocks that get flattened. If the
8279 // control flow is preserved, we should keep them.
8280 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8281 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8282
8283 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8284 // Dead instructions do not need sinking. Remove them from SinkAfter.
8285 for (Instruction *I : DeadInstructions)
8286 SinkAfter.erase(I);
8287
8288 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8289 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8290 VFRange SubRange = {VF, MaxVFPlusOne};
8291 VPlans.push_back(
8292 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8293 VF = SubRange.End;
8294 }
8295 }
8296
8297 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8298 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8299 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8300
8301 // Hold a mapping from predicated instructions to their recipes, in order to
8302 // fix their AlsoPack behavior if a user is determined to replicate and use a
8303 // scalar instead of a vector value.
8304 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8305
8306 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8307
8308 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8309
8310 // ---------------------------------------------------------------------------
8311 // Pre-construction: record ingredients whose recipes we'll need to further
8312 // process after constructing the initial VPlan.
8313 // ---------------------------------------------------------------------------
8314
8315 // Mark instructions we'll need to sink later and their targets as
8316 // ingredients whose recipe we'll need to record.
8317 for (auto &Entry : SinkAfter) {
8318 RecipeBuilder.recordRecipeOf(Entry.first);
8319 RecipeBuilder.recordRecipeOf(Entry.second);
8320 }
8321 for (auto &Reduction : CM.getInLoopReductionChains()) {
8322 PHINode *Phi = Reduction.first;
8323 RecurrenceDescriptor::RecurrenceKind Kind =
8324 Legal->getReductionVars()[Phi].getRecurrenceKind();
8325 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8326
8327 RecipeBuilder.recordRecipeOf(Phi);
8328 for (auto &R : ReductionOperations) {
8329 RecipeBuilder.recordRecipeOf(R);
8330 // For min/max reductions, where we have a pair of icmp/select, we also
8331 // need to record the ICmp recipe, so it can be removed later.
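// Illustrative IR for such an icmp/select pair (names assumed):
//   %c = icmp sgt i32 %x, %max.phi
//   %max.next = select i1 %c, i32 %x, i32 %max.phi
// Here the select is the recorded reduction operation R, and its first
// operand (%c) is the compare whose recipe is recorded below so it can be
// erased once the reduction recipe replaces it.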
8332 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8333 Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8334 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8335 }
8336 }
8337 }
8338
8339 // For each interleave group which is relevant for this (possibly trimmed)
8340 // Range, add it to the set of groups to be later applied to the VPlan and add
8341 // placeholders for its members' Recipes which we'll be replacing with a
8342 // single VPInterleaveRecipe.
8343 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8344 auto applyIG = [IG, this](ElementCount VF) -> bool {
8345 return (VF.isVector() && // Query is illegal for VF == 1
8346 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8347 LoopVectorizationCostModel::CM_Interleave);
8348 };
8349 if (!getDecisionAndClampRange(applyIG, Range))
8350 continue;
8351 InterleaveGroups.insert(IG);
8352 for (unsigned i = 0; i < IG->getFactor(); i++)
8353 if (Instruction *Member = IG->getMember(i))
8354 RecipeBuilder.recordRecipeOf(Member);
8355 }
8356
8357 // ---------------------------------------------------------------------------
8358 // Build initial VPlan: Scan the body of the loop in a topological order to
8359 // visit each basic block after having visited its predecessor basic blocks.
8360 // ---------------------------------------------------------------------------
8361
8362 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8363 auto Plan = std::make_unique<VPlan>();
8364 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8365 Plan->setEntry(VPBB);
8366
8367 // Scan the body of the loop in a topological order to visit each basic block
8368 // after having visited its predecessor basic blocks.
8369 LoopBlocksDFS DFS(OrigLoop);
8370 DFS.perform(LI);
8371
8372 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8373 // Relevant instructions from basic block BB will be grouped into VPRecipe
8374 // ingredients and fill a new VPBasicBlock.
8375 unsigned VPBBsForBB = 0;
8376 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8377 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8378 VPBB = FirstVPBBForBB;
8379 Builder.setInsertPoint(VPBB);
8380
8381 // Introduce each ingredient into VPlan.
8382 // TODO: Model and preserve debug intrinsics in VPlan.
8383 for (Instruction &I : BB->instructionsWithoutDebug()) {
8384 Instruction *Instr = &I;
8385
8386 // First filter out irrelevant instructions, to ensure no recipes are
8387 // built for them.
8388 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8389 continue;
8390
8391 if (auto Recipe =
8392 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8393 // Check if the recipe can be converted to a VPValue. We need the extra
8394 // down-casting step until VPRecipeBase inherits from VPValue.
8395 VPValue *MaybeVPValue = Recipe->toVPValue();
8396 if (!Instr->getType()->isVoidTy() && MaybeVPValue)
8397 Plan->addVPValue(Instr, MaybeVPValue);
8398
8399 RecipeBuilder.setRecipe(Instr, Recipe);
8400 VPBB->appendRecipe(Recipe);
8401 continue;
8402 }
8403
8404 // Otherwise, if all widening options failed, the instruction is to be
8405 // replicated. This may create a successor for VPBB.
8406 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
8407 Instr, Range, VPBB, PredInst2Recipe, Plan);
8408 if (NextVPBB != VPBB) {
8409 VPBB = NextVPBB;
8410 VPBB->setName(BB->hasName() ? BB->getName() + "."
+ Twine(VPBBsForBB++) 8411 : ""); 8412 } 8413 } 8414 } 8415 8416 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 8417 // may also be empty, such as the last one VPBB, reflecting original 8418 // basic-blocks with no recipes. 8419 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 8420 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 8421 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 8422 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 8423 delete PreEntry; 8424 8425 // --------------------------------------------------------------------------- 8426 // Transform initial VPlan: Apply previously taken decisions, in order, to 8427 // bring the VPlan to its final state. 8428 // --------------------------------------------------------------------------- 8429 8430 // Apply Sink-After legal constraints. 8431 for (auto &Entry : SinkAfter) { 8432 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8433 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8434 Sink->moveAfter(Target); 8435 } 8436 8437 // Interleave memory: for each Interleave Group we marked earlier as relevant 8438 // for this VPlan, replace the Recipes widening its memory instructions with a 8439 // single VPInterleaveRecipe at its insertion point. 8440 for (auto IG : InterleaveGroups) { 8441 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8442 RecipeBuilder.getRecipe(IG->getInsertPos())); 8443 SmallVector<VPValue *, 4> StoredValues; 8444 for (unsigned i = 0; i < IG->getFactor(); ++i) 8445 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8446 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8447 8448 (new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8449 Recipe->getMask())) 8450 ->insertBefore(Recipe); 8451 8452 for (unsigned i = 0; i < IG->getFactor(); ++i) 8453 if (Instruction *Member = IG->getMember(i)) { 8454 if (!Member->getType()->isVoidTy()) { 8455 VPValue *OriginalV = Plan->getVPValue(Member); 8456 Plan->removeVPValueFor(Member); 8457 OriginalV->replaceAllUsesWith(Plan->getOrAddVPValue(Member)); 8458 } 8459 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8460 } 8461 } 8462 8463 // Adjust the recipes for any inloop reductions. 8464 if (Range.Start.isVector()) 8465 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8466 8467 // Finally, if tail is folded by masking, introduce selects between the phi 8468 // and the live-out instruction of each reduction, at the end of the latch. 
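// Illustrative result (names assumed): with block-in mask %mask, the latch
// gets one select per reduction along the lines of
//   %rdx = select <VF x i1> %mask, <VF x i32> %rdx.next, <VF x i32> %rdx.phi
// so lanes that are masked off carry the phi's value forward unchanged.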
8469 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
8470 Builder.setInsertPoint(VPBB);
8471 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
8472 for (auto &Reduction : Legal->getReductionVars()) {
8473 if (CM.isInLoopReduction(Reduction.first))
8474 continue;
8475 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
8476 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
8477 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
8478 }
8479 }
8480
8481 std::string PlanName;
8482 raw_string_ostream RSO(PlanName);
8483 ElementCount VF = Range.Start;
8484 Plan->addVF(VF);
8485 RSO << "Initial VPlan for VF={" << VF;
8486 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
8487 Plan->addVF(VF);
8488 RSO << "," << VF;
8489 }
8490 RSO << "},UF>=1";
8491 RSO.flush();
8492 Plan->setName(PlanName);
8493
8494 return Plan;
8495 }
8496
8497 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8498 // Outer loop handling: outer loops may require CFG and instruction level
8499 // transformations before even evaluating whether vectorization is profitable.
8500 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8501 // the vectorization pipeline.
8502 assert(!OrigLoop->isInnermost());
8503 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8504
8505 // Create new empty VPlan
8506 auto Plan = std::make_unique<VPlan>();
8507
8508 // Build hierarchical CFG
8509 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8510 HCFGBuilder.buildHierarchicalCFG();
8511
8512 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8513 VF *= 2)
8514 Plan->addVF(VF);
8515
8516 if (EnableVPlanPredication) {
8517 VPlanPredicator VPP(*Plan);
8518 VPP.predicate();
8519
8520 // Avoid running transformation to recipes until masked code generation in
8521 // VPlan-native path is in place.
8522 return Plan;
8523 }
8524
8525 SmallPtrSet<Instruction *, 1> DeadInstructions;
8526 VPlanTransforms::VPInstructionsToVPRecipes(
8527 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
8528 return Plan;
8529 }
8530
8531 // Adjust the recipes for any inloop reductions. The chain of instructions
8532 // leading from the loop exit instr to the phi needs to be converted to
8533 // reductions, with one operand being vector and the other being the scalar
8534 // reduction chain.
8535 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8536 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8537 for (auto &Reduction : CM.getInLoopReductionChains()) {
8538 PHINode *Phi = Reduction.first;
8539 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8540 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8541
8542 // ReductionOperations are ordered top-down from the phi's use to the
8543 // LoopExitValue. We keep track of the previous item (the Chain) to tell
8544 // which of the two operands will remain scalar and which will be reduced.
8545 // For minmax the chain will be the select instructions.
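// Illustrative example (names assumed): for an in-loop integer add reduction
//   %sum.next = add i32 %sum.phi, %val
// the chain walked below is %sum.phi -> %sum.next; the chain operand stays
// scalar while %val supplies the vector operand that is reduced each
// iteration.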
8546 Instruction *Chain = Phi; 8547 for (Instruction *R : ReductionOperations) { 8548 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 8549 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind(); 8550 8551 VPValue *ChainOp = Plan->getVPValue(Chain); 8552 unsigned FirstOpId; 8553 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8554 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8555 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 8556 "Expected to replace a VPWidenSelectSC"); 8557 FirstOpId = 1; 8558 } else { 8559 assert(isa<VPWidenRecipe>(WidenRecipe) && 8560 "Expected to replace a VPWidenSC"); 8561 FirstOpId = 0; 8562 } 8563 unsigned VecOpId = 8564 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 8565 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 8566 8567 auto *CondOp = CM.foldTailByMasking() 8568 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 8569 : nullptr; 8570 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 8571 &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); 8572 WidenRecipe->toVPValue()->replaceAllUsesWith(RedRecipe); 8573 Plan->removeVPValueFor(R); 8574 Plan->addVPValue(R, RedRecipe); 8575 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 8576 WidenRecipe->eraseFromParent(); 8577 8578 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8579 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8580 VPRecipeBase *CompareRecipe = 8581 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 8582 assert(isa<VPWidenRecipe>(CompareRecipe) && 8583 "Expected to replace a VPWidenSC"); 8584 assert(CompareRecipe->toVPValue()->getNumUsers() == 0 && 8585 "Expected no remaining users"); 8586 CompareRecipe->eraseFromParent(); 8587 } 8588 Chain = R; 8589 } 8590 } 8591 } 8592 8593 Value* LoopVectorizationPlanner::VPCallbackILV:: 8594 getOrCreateVectorValues(Value *V, unsigned Part) { 8595 return ILV.getOrCreateVectorValue(V, Part); 8596 } 8597 8598 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 8599 Value *V, const VPIteration &Instance) { 8600 return ILV.getOrCreateScalarValue(V, Instance); 8601 } 8602 8603 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 8604 VPSlotTracker &SlotTracker) const { 8605 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 8606 IG->getInsertPos()->printAsOperand(O, false); 8607 O << ", "; 8608 getAddr()->printAsOperand(O, SlotTracker); 8609 VPValue *Mask = getMask(); 8610 if (Mask) { 8611 O << ", "; 8612 Mask->printAsOperand(O, SlotTracker); 8613 } 8614 for (unsigned i = 0; i < IG->getFactor(); ++i) 8615 if (Instruction *I = IG->getMember(i)) 8616 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 8617 } 8618 8619 void VPWidenCallRecipe::execute(VPTransformState &State) { 8620 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 8621 *this, State); 8622 } 8623 8624 void VPWidenSelectRecipe::execute(VPTransformState &State) { 8625 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 8626 this, *this, InvariantCond, State); 8627 } 8628 8629 void VPWidenRecipe::execute(VPTransformState &State) { 8630 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 8631 } 8632 8633 void VPWidenGEPRecipe::execute(VPTransformState &State) { 8634 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 8635 *this, State.UF, State.VF, IsPtrLoopInvariant, 8636 IsIndexLoopInvariant, State); 8637 } 8638 8639 void 
VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 8640 assert(!State.Instance && "Int or FP induction being replicated."); 8641 State.ILV->widenIntOrFpInduction(IV, Trunc); 8642 } 8643 8644 void VPWidenPHIRecipe::execute(VPTransformState &State) { 8645 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 8646 } 8647 8648 void VPBlendRecipe::execute(VPTransformState &State) { 8649 State.ILV->setDebugLocFromInst(State.Builder, Phi); 8650 // We know that all PHIs in non-header blocks are converted into 8651 // selects, so we don't have to worry about the insertion order and we 8652 // can just use the builder. 8653 // At this point we generate the predication tree. There may be 8654 // duplications since this is a simple recursive scan, but future 8655 // optimizations will clean it up. 8656 8657 unsigned NumIncoming = getNumIncomingValues(); 8658 8659 // Generate a sequence of selects of the form: 8660 // SELECT(Mask3, In3, 8661 // SELECT(Mask2, In2, 8662 // SELECT(Mask1, In1, 8663 // In0))) 8664 // Note that Mask0 is never used: lanes for which no path reaches this phi and 8665 // are essentially undef are taken from In0. 8666 InnerLoopVectorizer::VectorParts Entry(State.UF); 8667 for (unsigned In = 0; In < NumIncoming; ++In) { 8668 for (unsigned Part = 0; Part < State.UF; ++Part) { 8669 // We might have single edge PHIs (blocks) - use an identity 8670 // 'select' for the first PHI operand. 8671 Value *In0 = State.get(getIncomingValue(In), Part); 8672 if (In == 0) 8673 Entry[Part] = In0; // Initialize with the first incoming value. 8674 else { 8675 // Select between the current value and the previous incoming edge 8676 // based on the incoming mask. 8677 Value *Cond = State.get(getMask(In), Part); 8678 Entry[Part] = 8679 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 8680 } 8681 } 8682 } 8683 for (unsigned Part = 0; Part < State.UF; ++Part) 8684 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 8685 } 8686 8687 void VPInterleaveRecipe::execute(VPTransformState &State) { 8688 assert(!State.Instance && "Interleave group being replicated."); 8689 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getStoredValues(), 8690 getMask()); 8691 } 8692 8693 void VPReductionRecipe::execute(VPTransformState &State) { 8694 assert(!State.Instance && "Reduction being replicated."); 8695 for (unsigned Part = 0; Part < State.UF; ++Part) { 8696 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind(); 8697 Value *NewVecOp = State.get(getVecOp(), Part); 8698 if (VPValue *Cond = getCondOp()) { 8699 Value *NewCond = State.get(Cond, Part); 8700 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 8701 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 8702 Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType()); 8703 Constant *IdenVec = 8704 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 8705 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 8706 NewVecOp = Select; 8707 } 8708 Value *NewRed = 8709 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN); 8710 Value *PrevInChain = State.get(getChainOp(), Part); 8711 Value *NextInChain; 8712 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8713 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8714 NextInChain = 8715 createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(), 8716 NewRed, PrevInChain); 8717 } else { 8718 NextInChain = State.Builder.CreateBinOp( 8719 
(Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
8720 PrevInChain);
8721 }
8722 State.set(this, getUnderlyingInstr(), NextInChain, Part);
8723 }
8724 }
8725
8726 void VPReplicateRecipe::execute(VPTransformState &State) {
8727 if (State.Instance) { // Generate a single instance.
8728 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
8729 *State.Instance, IsPredicated, State);
8730 // Insert the scalar instance, packing it into a vector.
8731 if (AlsoPack && State.VF.isVector()) {
8732 // If we're constructing lane 0, initialize to start from undef.
8733 if (State.Instance->Lane == 0) {
8734 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8735 Value *Undef = UndefValue::get(
8736 VectorType::get(getUnderlyingValue()->getType(), State.VF));
8737 State.ValueMap.setVectorValue(getUnderlyingInstr(),
8738 State.Instance->Part, Undef);
8739 }
8740 State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
8741 *State.Instance);
8742 }
8743 return;
8744 }
8745
8746 // Generate scalar instances for all VF lanes of all UF parts, unless the
8747 // instruction is uniform, in which case generate only the first lane for each
8748 // of the UF parts.
8749 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8750 for (unsigned Part = 0; Part < State.UF; ++Part)
8751 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8752 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
8753 IsPredicated, State);
8754 }
8755
8756 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8757 assert(State.Instance && "Branch on Mask works only on single instance.");
8758
8759 unsigned Part = State.Instance->Part;
8760 unsigned Lane = State.Instance->Lane;
8761
8762 Value *ConditionBit = nullptr;
8763 VPValue *BlockInMask = getMask();
8764 if (BlockInMask) {
8765 ConditionBit = State.get(BlockInMask, Part);
8766 if (ConditionBit->getType()->isVectorTy())
8767 ConditionBit = State.Builder.CreateExtractElement(
8768 ConditionBit, State.Builder.getInt32(Lane));
8769 } else // Block in mask is all-one.
8770 ConditionBit = State.Builder.getTrue();
8771
8772 // Replace the temporary unreachable terminator with a new conditional branch,
8773 // whose two destinations will be set later when they are created.
8774 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8775 assert(isa<UnreachableInst>(CurrentTerminator) &&
8776 "Expected to replace unreachable terminator with conditional branch.");
8777 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8778 CondBr->setSuccessor(0, nullptr);
8779 ReplaceInstWithInst(CurrentTerminator, CondBr);
8780 }
8781
8782 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8783 assert(State.Instance && "Predicated instruction PHI works per instance.");
8784 Instruction *ScalarPredInst =
8785 cast<Instruction>(State.get(getOperand(0), *State.Instance));
8786 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8787 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8788 assert(PredicatingBB && "Predicated block has no single predecessor.");
8789
8790 // By current pack/unpack logic we need to generate only a single phi node: if
8791 // a vector value for the predicated instruction exists at this point it means
8792 // the instruction has vector users only, and a phi for the vector value is
8793 // needed. In this case the recipe of the predicated instruction is marked to
8794 // also do that packing, thereby "hoisting" the insert-element sequence.
8795 // Otherwise, a phi node for the scalar value is needed. 8796 unsigned Part = State.Instance->Part; 8797 Instruction *PredInst = 8798 cast<Instruction>(getOperand(0)->getUnderlyingValue()); 8799 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 8800 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 8801 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 8802 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 8803 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 8804 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 8805 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 8806 } else { 8807 Type *PredInstType = PredInst->getType(); 8808 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 8809 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 8810 Phi->addIncoming(ScalarPredInst, PredicatedBB); 8811 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 8812 } 8813 } 8814 8815 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 8816 Instruction *Instr = getUnderlyingInstr(); 8817 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 8818 State.ILV->vectorizeMemoryInstruction(Instr, State, 8819 StoredValue ? nullptr : this, getAddr(), 8820 StoredValue, getMask()); 8821 } 8822 8823 // Determine how to lower the scalar epilogue, which depends on 1) optimising 8824 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 8825 // predication, and 4) a TTI hook that analyses whether the loop is suitable 8826 // for predication. 8827 static ScalarEpilogueLowering getScalarEpilogueLowering( 8828 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 8829 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 8830 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 8831 LoopVectorizationLegality &LVL) { 8832 // 1) OptSize takes precedence over all other options, i.e. if this is set, 8833 // don't look at hints or options, and don't request a scalar epilogue. 8834 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 8835 // LoopAccessInfo (due to code dependency and not being able to reliably get 8836 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 8837 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 8838 // versioning when the vectorization is forced, unlike hasOptSize. So revert 8839 // back to the old way and vectorize with versioning when forced. See D81345.) 8840 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 8841 PGSOQueryType::IRPass) && 8842 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 8843 return CM_ScalarEpilogueNotAllowedOptSize; 8844 8845 bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() && 8846 !PreferPredicateOverEpilogue; 8847 8848 // 2) Next, if disabling predication is requested on the command line, honour 8849 // this and request a scalar epilogue. 8850 if (PredicateOptDisabled) 8851 return CM_ScalarEpilogueAllowed; 8852 8853 // 3) and 4) look if enabling predication is requested on the command line, 8854 // with a loop hint, or if the TTI hook indicates this is profitable, request 8855 // predication. 
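// For example (illustrative): predication can be requested per loop with
//   #pragma clang loop vectorize_predicate(enable)
// which is what Hints.getPredicate() reports below, or globally through the
// PreferPredicateOverEpilogue command-line option.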
8856 if (PreferPredicateOverEpilogue ||
8857 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
8858 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8859 LVL.getLAI()) &&
8860 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
8861 return CM_ScalarEpilogueNotNeededUsePredicate;
8862
8863 return CM_ScalarEpilogueAllowed;
8864 }
8865
8866 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
8867 unsigned Part) {
8868 set(Def, V, Part);
8869 ILV->setVectorValue(IRDef, Part, V);
8870 }
8871
8872 // Process the loop in the VPlan-native vectorization path. This path builds
8873 // VPlan upfront in the vectorization pipeline, which allows applying
8874 // VPlan-to-VPlan transformations from the very beginning without modifying the
8875 // input LLVM IR.
8876 static bool processLoopInVPlanNativePath(
8877 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8878 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8879 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8880 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8881 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8882
8883 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
8884 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8885 return false;
8886 }
8887 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8888 Function *F = L->getHeader()->getParent();
8889 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8890
8891 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8892 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8893
8894 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8895 &Hints, IAI);
8896 // Use the planner for outer loop vectorization.
8897 // TODO: CM is not used at this point inside the planner. Turn CM into an
8898 // optional argument if we don't need it in the future.
8899 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8900
8901 // Get user vectorization factor.
8902 ElementCount UserVF = Hints.getWidth();
8903 if (UserVF.isScalable()) {
8904 // TODO: Use scalable UserVF once we've added initial support for scalable
8905 // vectorization. For now we convert it to fixed width, but this will be
8906 // removed in a later patch.
8907 UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
8908 }
8909
8910 // Plan how to best vectorize, return the best VF and its cost.
8911 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
8912
8913 // If we are stress testing VPlan builds, do not attempt to generate vector
8914 // code. Masked vector code generation support will follow soon.
8915 // Also, do not attempt to vectorize if no vector code will be produced.
8916 if (VPlanBuildStressTest || EnableVPlanPredication ||
8917 VectorizationFactor::Disabled() == VF)
8918 return false;
8919
8920 LVP.setBestPlan(VF.Width, 1);
8921
8922 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8923 &CM, BFI, PSI);
8924 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8925 << L->getHeader()->getParent()->getName() << "\"\n");
8926 LVP.executePlan(LB, DT);
8927
8928 // Mark the loop as already vectorized to avoid vectorizing again.
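// Illustratively, this ends up attaching loop metadata along the lines of
//   !{!"llvm.loop.isvectorized", i32 1}
// so subsequent runs of the vectorizer leave this loop alone.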
8929 Hints.setAlreadyVectorized(); 8930 8931 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 8932 return true; 8933 } 8934 8935 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 8936 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 8937 !EnableLoopInterleaving), 8938 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 8939 !EnableLoopVectorization) {} 8940 8941 bool LoopVectorizePass::processLoop(Loop *L) { 8942 assert((EnableVPlanNativePath || L->isInnermost()) && 8943 "VPlan-native path is not enabled. Only process inner loops."); 8944 8945 #ifndef NDEBUG 8946 const std::string DebugLocStr = getDebugLocString(L); 8947 #endif /* NDEBUG */ 8948 8949 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 8950 << L->getHeader()->getParent()->getName() << "\" from " 8951 << DebugLocStr << "\n"); 8952 8953 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 8954 8955 LLVM_DEBUG( 8956 dbgs() << "LV: Loop hints:" 8957 << " force=" 8958 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 8959 ? "disabled" 8960 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 8961 ? "enabled" 8962 : "?")) 8963 << " width=" << Hints.getWidth() 8964 << " unroll=" << Hints.getInterleave() << "\n"); 8965 8966 // Function containing loop 8967 Function *F = L->getHeader()->getParent(); 8968 8969 // Looking at the diagnostic output is the only way to determine if a loop 8970 // was vectorized (other than looking at the IR or machine code), so it 8971 // is important to generate an optimization remark for each loop. Most of 8972 // these messages are generated as OptimizationRemarkAnalysis. Remarks 8973 // generated as OptimizationRemark and OptimizationRemarkMissed are 8974 // less verbose reporting vectorized loops and unvectorized loops that may 8975 // benefit from vectorization, respectively. 8976 8977 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 8978 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 8979 return false; 8980 } 8981 8982 PredicatedScalarEvolution PSE(*SE, *L); 8983 8984 // Check if it is legal to vectorize the loop. 8985 LoopVectorizationRequirements Requirements(*ORE); 8986 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 8987 &Requirements, &Hints, DB, AC, BFI, PSI); 8988 if (!LVL.canVectorize(EnableVPlanNativePath)) { 8989 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 8990 Hints.emitRemarkWithHints(); 8991 return false; 8992 } 8993 8994 // Check the function attributes and profiles to find out if this function 8995 // should be optimized for size. 8996 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8997 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 8998 8999 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9000 // here. They may require CFG and instruction level transformations before 9001 // even evaluating whether vectorization is profitable. Since we cannot modify 9002 // the incoming IR, we need to build VPlan upfront in the vectorization 9003 // pipeline. 9004 if (!L->isInnermost()) 9005 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9006 ORE, BFI, PSI, Hints); 9007 9008 assert(L->isInnermost() && "Inner loop expected."); 9009 9010 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9011 // count by optimizing for size, to minimize overheads. 
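// Illustrative case (trip count assumed): a loop like
//   for (int i = 0; i < 4; ++i) a[i] += b[i];
// has a tiny known trip count, so unless vectorization is explicitly forced
// the scalar-epilogue policy is tightened below to avoid epilogue overhead.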
9012 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9013 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9014 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9015 << "This loop is worth vectorizing only if no scalar " 9016 << "iteration overheads are incurred."); 9017 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9018 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9019 else { 9020 LLVM_DEBUG(dbgs() << "\n"); 9021 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9022 } 9023 } 9024 9025 // Check the function attributes to see if implicit floats are allowed. 9026 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9027 // an integer loop and the vector instructions selected are purely integer 9028 // vector instructions? 9029 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9030 reportVectorizationFailure( 9031 "Can't vectorize when the NoImplicitFloat attribute is used", 9032 "loop not vectorized due to NoImplicitFloat attribute", 9033 "NoImplicitFloat", ORE, L); 9034 Hints.emitRemarkWithHints(); 9035 return false; 9036 } 9037 9038 // Check if the target supports potentially unsafe FP vectorization. 9039 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9040 // for the target we're vectorizing for, to make sure none of the 9041 // additional fp-math flags can help. 9042 if (Hints.isPotentiallyUnsafe() && 9043 TTI->isFPVectorizationPotentiallyUnsafe()) { 9044 reportVectorizationFailure( 9045 "Potentially unsafe FP op prevents vectorization", 9046 "loop not vectorized due to unsafe FP support.", 9047 "UnsafeFP", ORE, L); 9048 Hints.emitRemarkWithHints(); 9049 return false; 9050 } 9051 9052 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9053 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9054 9055 // If an override option has been passed in for interleaved accesses, use it. 9056 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9057 UseInterleaved = EnableInterleavedMemAccesses; 9058 9059 // Analyze interleaved memory accesses. 9060 if (UseInterleaved) { 9061 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 9062 } 9063 9064 // Use the cost model. 9065 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 9066 F, &Hints, IAI); 9067 CM.collectValuesToIgnore(); 9068 9069 // Use the planner for vectorization. 9070 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 9071 9072 // Get user vectorization factor and interleave count. 9073 ElementCount UserVF = Hints.getWidth(); 9074 if (UserVF.isScalable()) { 9075 // TODO: Use scalable UserVF once we've added initial support for scalable 9076 // vectorization. For now we convert it to fixed width, but this will be 9077 // removed in a later patch. 9078 UserVF = ElementCount::getFixed(UserVF.getKnownMinValue()); 9079 } 9080 9081 unsigned UserIC = Hints.getInterleave(); 9082 9083 // Plan how to best vectorize, return the best VF and its cost. 9084 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 9085 9086 VectorizationFactor VF = VectorizationFactor::Disabled(); 9087 unsigned IC = 1; 9088 9089 if (MaybeVF) { 9090 VF = *MaybeVF; 9091 // Select the interleave count. 9092 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 9093 } 9094 9095 // Identify the diagnostic messages that should be produced. 
9096 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9097 bool VectorizeLoop = true, InterleaveLoop = true;
9098 if (Requirements.doesNotMeet(F, L, Hints)) {
9099 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9100 "requirements.\n");
9101 Hints.emitRemarkWithHints();
9102 return false;
9103 }
9104
9105 if (VF.Width.isScalar()) {
9106 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9107 VecDiagMsg = std::make_pair(
9108 "VectorizationNotBeneficial",
9109 "the cost-model indicates that vectorization is not beneficial");
9110 VectorizeLoop = false;
9111 }
9112
9113 if (!MaybeVF && UserIC > 1) {
9114 // Tell the user interleaving was avoided up-front, despite being explicitly
9115 // requested.
9116 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9117 "interleaving should be avoided up front\n");
9118 IntDiagMsg = std::make_pair(
9119 "InterleavingAvoided",
9120 "Ignoring UserIC, because interleaving was avoided up front");
9121 InterleaveLoop = false;
9122 } else if (IC == 1 && UserIC <= 1) {
9123 // Tell the user interleaving is not beneficial.
9124 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9125 IntDiagMsg = std::make_pair(
9126 "InterleavingNotBeneficial",
9127 "the cost-model indicates that interleaving is not beneficial");
9128 InterleaveLoop = false;
9129 if (UserIC == 1) {
9130 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9131 IntDiagMsg.second +=
9132 " and is explicitly disabled or interleave count is set to 1";
9133 }
9134 } else if (IC > 1 && UserIC == 1) {
9135 // Tell the user interleaving is beneficial, but it is explicitly disabled.
9136 LLVM_DEBUG(
9137 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9138 IntDiagMsg = std::make_pair(
9139 "InterleavingBeneficialButDisabled",
9140 "the cost-model indicates that interleaving is beneficial "
9141 "but is explicitly disabled or interleave count is set to 1");
9142 InterleaveLoop = false;
9143 }
9144
9145 // Override IC if user provided an interleave count.
9146 IC = UserIC > 0 ? UserIC : IC;
9147
9148 // Emit diagnostic messages, if any.
9149 const char *VAPassName = Hints.vectorizeAnalysisPassName();
9150 if (!VectorizeLoop && !InterleaveLoop) {
9151 // Do not vectorize or interleave the loop.
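// These remarks are what, e.g., clang's -Rpass-analysis=loop-vectorize and
// -Rpass-missed=loop-vectorize options surface to the user (illustrative
// invocation; the message text itself is assembled above).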
9152 ORE->emit([&]() { 9153 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 9154 L->getStartLoc(), L->getHeader()) 9155 << VecDiagMsg.second; 9156 }); 9157 ORE->emit([&]() { 9158 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 9159 L->getStartLoc(), L->getHeader()) 9160 << IntDiagMsg.second; 9161 }); 9162 return false; 9163 } else if (!VectorizeLoop && InterleaveLoop) { 9164 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9165 ORE->emit([&]() { 9166 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 9167 L->getStartLoc(), L->getHeader()) 9168 << VecDiagMsg.second; 9169 }); 9170 } else if (VectorizeLoop && !InterleaveLoop) { 9171 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9172 << ") in " << DebugLocStr << '\n'); 9173 ORE->emit([&]() { 9174 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 9175 L->getStartLoc(), L->getHeader()) 9176 << IntDiagMsg.second; 9177 }); 9178 } else if (VectorizeLoop && InterleaveLoop) { 9179 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9180 << ") in " << DebugLocStr << '\n'); 9181 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9182 } 9183 9184 LVP.setBestPlan(VF.Width, IC); 9185 9186 using namespace ore; 9187 bool DisableRuntimeUnroll = false; 9188 MDNode *OrigLoopID = L->getLoopID(); 9189 9190 if (!VectorizeLoop) { 9191 assert(IC > 1 && "interleave count should not be 1 or 0"); 9192 // If we decided that it is not legal to vectorize the loop, then 9193 // interleave it. 9194 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, 9195 BFI, PSI); 9196 LVP.executePlan(Unroller, DT); 9197 9198 ORE->emit([&]() { 9199 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 9200 L->getHeader()) 9201 << "interleaved loop (interleaved count: " 9202 << NV("InterleaveCount", IC) << ")"; 9203 }); 9204 } else { 9205 // If we decided that it is *legal* to vectorize the loop, then do it. 9206 9207 // Consider vectorizing the epilogue too if it's profitable. 9208 VectorizationFactor EpilogueVF = 9209 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 9210 if (EpilogueVF.Width.isVector()) { 9211 9212 // The first pass vectorizes the main loop and creates a scalar epilogue 9213 // to be vectorized by executing the plan (potentially with a different 9214 // factor) again shortly afterwards. 9215 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 9216 EpilogueVF.Width.getKnownMinValue(), 1); 9217 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, 9218 &LVL, &CM, BFI, PSI); 9219 9220 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 9221 LVP.executePlan(MainILV, DT); 9222 ++LoopsVectorized; 9223 9224 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9225 formLCSSARecursively(*L, *DT, LI, SE); 9226 9227 // Second pass vectorizes the epilogue and adjusts the control flow 9228 // edges from the first pass. 
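// Illustrative outcome (factors assumed): with a main VF of 8 and an
// epilogue VF of 4, the original loop becomes a VF=8 vector loop, followed
// by a VF=4 vector epilogue for the next remaining iterations, followed by
// a scalar remainder loop for anything left over.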
9229 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
9230 EPI.MainLoopVF = EPI.EpilogueVF;
9231 EPI.MainLoopUF = EPI.EpilogueUF;
9232 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9233 ORE, EPI, &LVL, &CM, BFI, PSI);
9234 LVP.executePlan(EpilogILV, DT);
9235 ++LoopsEpilogueVectorized;
9236
9237 if (!MainILV.areSafetyChecksAdded())
9238 DisableRuntimeUnroll = true;
9239 } else {
9240 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
9241 &LVL, &CM, BFI, PSI);
9242 LVP.executePlan(LB, DT);
9243 ++LoopsVectorized;
9244
9245 // Add metadata to disable runtime unrolling of the scalar loop when there
9246 // are no runtime checks about strides and memory. A scalar loop that is
9247 // rarely used is not worth unrolling.
9248 if (!LB.areSafetyChecksAdded())
9249 DisableRuntimeUnroll = true;
9250 }
9251
9252 // Report the vectorization decision.
9253 ORE->emit([&]() {
9254 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
9255 L->getHeader())
9256 << "vectorized loop (vectorization width: "
9257 << NV("VectorizationFactor", VF.Width)
9258 << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
9259 });
9260 }
9261
9262 Optional<MDNode *> RemainderLoopID =
9263 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
9264 LLVMLoopVectorizeFollowupEpilogue});
9265 if (RemainderLoopID.hasValue()) {
9266 L->setLoopID(RemainderLoopID.getValue());
9267 } else {
9268 if (DisableRuntimeUnroll)
9269 AddRuntimeUnrollDisableMetaData(L);
9270
9271 // Mark the loop as already vectorized to avoid vectorizing again.
9272 Hints.setAlreadyVectorized();
9273 }
9274
9275 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9276 return true;
9277 }
9278
9279 LoopVectorizeResult LoopVectorizePass::runImpl(
9280 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
9281 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
9282 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
9283 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
9284 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
9285 SE = &SE_;
9286 LI = &LI_;
9287 TTI = &TTI_;
9288 DT = &DT_;
9289 BFI = &BFI_;
9290 TLI = TLI_;
9291 AA = &AA_;
9292 AC = &AC_;
9293 GetLAA = &GetLAA_;
9294 DB = &DB_;
9295 ORE = &ORE_;
9296 PSI = PSI_;
9297
9298 // Don't attempt if
9299 // 1. the target claims to have no vector registers, and
9300 // 2. interleaving won't help ILP.
9301 //
9302 // The second condition is necessary because, even if the target has no
9303 // vector registers, loop vectorization may still enable scalar
9304 // interleaving.
9305 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
9306 TTI->getMaxInterleaveFactor(1) < 2)
9307 return LoopVectorizeResult(false, false);
9308
9309 bool Changed = false, CFGChanged = false;
9310
9311 // The vectorizer requires loops to be in simplified form.
9312 // Since simplification may add new inner loops, it has to run before the
9313 // legality and profitability checks. This means running the loop vectorizer
9314 // will simplify all loops, regardless of whether anything ends up being
9315 // vectorized.
9316 for (auto &L : *LI)
9317 Changed |= CFGChanged |=
9318 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9319
9320 // Build up a worklist of inner-loops to vectorize. This is necessary as
9321 // the act of vectorizing or partially unrolling a loop creates new loops
9322 // and can invalidate iterators across the loops.
9323 SmallVector<Loop *, 8> Worklist; 9324 9325 for (Loop *L : *LI) 9326 collectSupportedLoops(*L, LI, ORE, Worklist); 9327 9328 LoopsAnalyzed += Worklist.size(); 9329 9330 // Now walk the identified inner loops. 9331 while (!Worklist.empty()) { 9332 Loop *L = Worklist.pop_back_val(); 9333 9334 // For the inner loops we actually process, form LCSSA to simplify the 9335 // transform. 9336 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 9337 9338 Changed |= CFGChanged |= processLoop(L); 9339 } 9340 9341 // Process each loop nest in the function. 9342 return LoopVectorizeResult(Changed, CFGChanged); 9343 } 9344 9345 PreservedAnalyses LoopVectorizePass::run(Function &F, 9346 FunctionAnalysisManager &AM) { 9347 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 9348 auto &LI = AM.getResult<LoopAnalysis>(F); 9349 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 9350 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 9351 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 9352 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 9353 auto &AA = AM.getResult<AAManager>(F); 9354 auto &AC = AM.getResult<AssumptionAnalysis>(F); 9355 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 9356 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 9357 MemorySSA *MSSA = EnableMSSALoopDependency 9358 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 9359 : nullptr; 9360 9361 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 9362 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 9363 [&](Loop &L) -> const LoopAccessInfo & { 9364 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 9365 TLI, TTI, nullptr, MSSA}; 9366 return LAM.getResult<LoopAccessAnalysis>(L, AR); 9367 }; 9368 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 9369 ProfileSummaryInfo *PSI = 9370 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 9371 LoopVectorizeResult Result = 9372 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 9373 if (!Result.MadeAnyChange) 9374 return PreservedAnalyses::all(); 9375 PreservedAnalyses PA; 9376 9377 // We currently do not preserve loopinfo/dominator analyses with outer loop 9378 // vectorization. Until this is addressed, mark these analyses as preserved 9379 // only for non-VPlan-native path. 9380 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 9381 if (!EnableVPlanNativePath) { 9382 PA.preserve<LoopAnalysis>(); 9383 PA.preserve<DominatorTreeAnalysis>(); 9384 } 9385 PA.preserve<BasicAA>(); 9386 PA.preserve<GlobalsAA>(); 9387 if (!Result.MadeCFGChange) 9388 PA.preserveSet<CFGAnalyses>(); 9389 return PA; 9390 } 9391