//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one. (See the illustrative sketch at
// the end of this header comment.)
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
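//
// As an illustrative sketch only (an exposition aid, not code produced
// verbatim by this pass): a source loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is, for a vectorization factor of 4, rewritten so that each iteration of
// the new 'wide' loop loads four elements of b and c as <4 x i32>, performs a
// single vector add, stores the <4 x i32> result into a, and advances the
// index by 4. The iterations left over when n is not a multiple of 4 are
// executed by a scalar remainder (epilogue) loop.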
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; the enum below lists the
// options. I.e., the vectorizer will try to fold the tail loop (epilogue)
// into the vector body and predicate the instructions accordingly. If
// tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization "
                          "if tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
Mostly " 273 "useful for getting consistent testing.")); 274 275 static cl::opt<unsigned> SmallLoopCost( 276 "small-loop-cost", cl::init(20), cl::Hidden, 277 cl::desc( 278 "The cost of a loop that is considered 'small' by the interleaver.")); 279 280 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 281 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 282 cl::desc("Enable the use of the block frequency analysis to access PGO " 283 "heuristics minimizing code growth in cold regions and being more " 284 "aggressive in hot regions.")); 285 286 // Runtime interleave loops for load/store throughput. 287 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 288 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 289 cl::desc( 290 "Enable runtime interleaving until load/store ports are saturated")); 291 292 /// Interleave small loops with scalar reductions. 293 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 294 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 295 cl::desc("Enable interleaving for loops with small iteration counts that " 296 "contain scalar reductions to expose ILP.")); 297 298 /// The number of stores in a loop that are allowed to need predication. 299 static cl::opt<unsigned> NumberOfStoresToPredicate( 300 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 301 cl::desc("Max number of stores to be predicated behind an if.")); 302 303 static cl::opt<bool> EnableIndVarRegisterHeur( 304 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 305 cl::desc("Count the induction variable only once when interleaving")); 306 307 static cl::opt<bool> EnableCondStoresVectorization( 308 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 309 cl::desc("Enable if predication of stores during vectorization.")); 310 311 static cl::opt<unsigned> MaxNestedScalarReductionIC( 312 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 313 cl::desc("The maximum interleave count to use when interleaving a scalar " 314 "reduction in a nested loop.")); 315 316 static cl::opt<bool> 317 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 318 cl::Hidden, 319 cl::desc("Prefer in-loop vector reductions, " 320 "overriding the targets preference.")); 321 322 static cl::opt<bool> PreferPredicatedReductionSelect( 323 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 324 cl::desc( 325 "Prefer predicating a reduction operation over an after loop select.")); 326 327 cl::opt<bool> EnableVPlanNativePath( 328 "enable-vplan-native-path", cl::init(false), cl::Hidden, 329 cl::desc("Enable VPlan-native vectorization path with " 330 "support for outer loop vectorization.")); 331 332 // FIXME: Remove this switch once we have divergence analysis. Currently we 333 // assume divergent non-backedge branches when this switch is true. 334 cl::opt<bool> EnableVPlanPredication( 335 "enable-vplan-predication", cl::init(false), cl::Hidden, 336 cl::desc("Enable VPlan-native vectorization path predicator with " 337 "support for outer loop vectorization.")); 338 339 // This flag enables the stress testing of the VPlan H-CFG construction in the 340 // VPlan-native vectorization path. It must be used in conjuction with 341 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the 342 // verification of the H-CFGs built. 
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return TypeSize::get(VF.getKnownMinValue() *
                             DL.getTypeAllocSize(Ty).getFixedValue(),
                         VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
/// 1) Returns exact trip count if it is known.
/// 2) Returns expected trip count according to profile data if any.
/// 3) Returns upper bound estimate if it is known.
/// 4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead of
  /// \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate
  /// a vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
    VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
  }

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In
  /// the latter case \p EntryVal is a TruncInst and we must not record anything
  /// for that IV, but it's error-prone to expect callers of this routine to
  /// care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g., epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                                 LoopInfo *LI, DominatorTree *DT,
                                 const TargetLibraryInfo *TLI,
                                 const TargetTransformInfo *TTI,
                                 AssumptionCache *AC,
                                 OptimizationRemarkEmitter *ORE,
                                 EpilogueLoopVectorizationInfo &EPI,
                                 LoopVectorizationLegality *LVL,
                                 llvm::LoopVectorizationCostModel *CM,
                                 BlockFrequencyInfo *BFI,
                                 ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form the instruction takes after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W, unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
1389 InstWidening getWideningDecision(Instruction *I, ElementCount VF) { 1390 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1391 assert(VF.isVector() && "Expected VF >=2"); 1392 1393 // Cost model is not run in the VPlan-native path - return conservative 1394 // result until this changes. 1395 if (EnableVPlanNativePath) 1396 return CM_GatherScatter; 1397 1398 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1399 auto Itr = WideningDecisions.find(InstOnVF); 1400 if (Itr == WideningDecisions.end()) 1401 return CM_Unknown; 1402 return Itr->second.first; 1403 } 1404 1405 /// Return the vectorization cost for the given instruction \p I and vector 1406 /// width \p VF. 1407 unsigned getWideningCost(Instruction *I, ElementCount VF) { 1408 assert(VF.isVector() && "Expected VF >=2"); 1409 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1410 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1411 "The cost is not calculated"); 1412 return WideningDecisions[InstOnVF].second; 1413 } 1414 1415 /// Return True if instruction \p I is an optimizable truncate whose operand 1416 /// is an induction variable. Such a truncate will be removed by adding a new 1417 /// induction variable with the destination type. 1418 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1419 // If the instruction is not a truncate, return false. 1420 auto *Trunc = dyn_cast<TruncInst>(I); 1421 if (!Trunc) 1422 return false; 1423 1424 // Get the source and destination types of the truncate. 1425 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1426 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1427 1428 // If the truncate is free for the given types, return false. Replacing a 1429 // free truncate with an induction variable would add an induction variable 1430 // update instruction to each iteration of the loop. We exclude from this 1431 // check the primary induction variable since it will need an update 1432 // instruction regardless. 1433 Value *Op = Trunc->getOperand(0); 1434 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1435 return false; 1436 1437 // If the truncated value is not an induction variable, return false. 1438 return Legal->isInductionPhi(Op); 1439 } 1440 1441 /// Collects the instructions to scalarize for each predicated instruction in 1442 /// the loop. 1443 void collectInstsToScalarize(ElementCount VF); 1444 1445 /// Collect Uniform and Scalar values for the given \p VF. 1446 /// The sets depend on CM decision for Load/Store instructions 1447 /// that may be vectorized as interleave, gather-scatter or scalarized. 1448 void collectUniformsAndScalars(ElementCount VF) { 1449 // Do the analysis once. 1450 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1451 return; 1452 setCostBasedWideningDecision(VF); 1453 collectLoopUniforms(VF); 1454 collectLoopScalars(VF); 1455 } 1456 1457 /// Returns true if the target machine supports masked store operation 1458 /// for the given \p DataType and kind of access to \p Ptr. 1459 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) { 1460 return Legal->isConsecutivePtr(Ptr) && 1461 TTI.isLegalMaskedStore(DataType, Alignment); 1462 } 1463 1464 /// Returns true if the target machine supports masked load operation 1465 /// for the given \p DataType and kind of access to \p Ptr. 
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1550 bool isScalarEpilogueAllowed() const { 1551 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1552 } 1553 1554 /// Returns true if all loop blocks should be masked to fold tail loop. 1555 bool foldTailByMasking() const { return FoldTailByMasking; } 1556 1557 bool blockNeedsPredication(BasicBlock *BB) { 1558 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1559 } 1560 1561 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1562 /// nodes to the chain of instructions representing the reductions. Uses a 1563 /// MapVector to ensure deterministic iteration order. 1564 using ReductionChainMap = 1565 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1566 1567 /// Return the chain of instructions representing an inloop reduction. 1568 const ReductionChainMap &getInLoopReductionChains() const { 1569 return InLoopReductionChains; 1570 } 1571 1572 /// Returns true if the Phi is part of an inloop reduction. 1573 bool isInLoopReduction(PHINode *Phi) const { 1574 return InLoopReductionChains.count(Phi); 1575 } 1576 1577 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1578 /// with factor VF. Return the cost of the instruction, including 1579 /// scalarization overhead if it's needed. 1580 unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1581 1582 /// Estimate cost of a call instruction CI if it were vectorized with factor 1583 /// VF. Return the cost of the instruction, including scalarization overhead 1584 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1585 /// scalarized - 1586 /// i.e. either vector version isn't available, or is too expensive. 1587 unsigned getVectorCallCost(CallInst *CI, ElementCount VF, 1588 bool &NeedToScalarize); 1589 1590 /// Invalidates decisions already taken by the cost model. 1591 void invalidateCostModelingDecisions() { 1592 WideningDecisions.clear(); 1593 Uniforms.clear(); 1594 Scalars.clear(); 1595 } 1596 1597 private: 1598 unsigned NumPredStores = 0; 1599 1600 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1601 /// than zero. One is returned if vectorization should best be avoided due 1602 /// to cost. 1603 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, 1604 ElementCount UserVF); 1605 1606 /// The vectorization cost is a combination of the cost itself and a boolean 1607 /// indicating whether any of the contributing operations will actually 1608 /// operate on 1609 /// vector values after type legalization in the backend. If this latter value 1610 /// is 1611 /// false, then all operations will be scalarized (i.e. no vectorization has 1612 /// actually taken place). 1613 using VectorizationCostTy = std::pair<unsigned, bool>; 1614 1615 /// Returns the expected execution cost. The unit of the cost does 1616 /// not matter because we use the 'cost' units to compare different 1617 /// vector widths. The cost that is returned is *not* normalized by 1618 /// the factor width. 1619 VectorizationCostTy expectedCost(ElementCount VF); 1620 1621 /// Returns the execution time cost of an instruction for a given vector 1622 /// width. Vector width of one means scalar. 1623 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1624 1625 /// The cost-computation logic from getInstructionCost which provides 1626 /// the vector type as an output parameter. 
  unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
1696 /// The data is collected per VF. 1697 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1698 1699 /// Holds the instructions known to be scalar after vectorization. 1700 /// The data is collected per VF. 1701 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1702 1703 /// Holds the instructions (address computations) that are forced to be 1704 /// scalarized. 1705 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1706 1707 /// PHINodes of the reductions that should be expanded in-loop along with 1708 /// their associated chains of reduction operations, in program order from top 1709 /// (PHI) to bottom 1710 ReductionChainMap InLoopReductionChains; 1711 1712 /// Returns the expected difference in cost from scalarizing the expression 1713 /// feeding a predicated instruction \p PredInst. The instructions to 1714 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1715 /// non-negative return value implies the expression will be scalarized. 1716 /// Currently, only single-use chains are considered for scalarization. 1717 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1718 ElementCount VF); 1719 1720 /// Collect the instructions that are uniform after vectorization. An 1721 /// instruction is uniform if we represent it with a single scalar value in 1722 /// the vectorized loop corresponding to each vector iteration. Examples of 1723 /// uniform instructions include pointer operands of consecutive or 1724 /// interleaved memory accesses. Note that although uniformity implies an 1725 /// instruction will be scalar, the reverse is not true. In general, a 1726 /// scalarized instruction will be represented by VF scalar values in the 1727 /// vectorized loop, each corresponding to an iteration of the original 1728 /// scalar loop. 1729 void collectLoopUniforms(ElementCount VF); 1730 1731 /// Collect the instructions that are scalar after vectorization. An 1732 /// instruction is scalar if it is known to be uniform or will be scalarized 1733 /// during vectorization. Non-uniform scalarized instructions will be 1734 /// represented by VF values in the vectorized loop, each corresponding to an 1735 /// iteration of the original scalar loop. 1736 void collectLoopScalars(ElementCount VF); 1737 1738 /// Keeps cost model vectorization decision and cost for instructions. 1739 /// Right now it is used for memory instructions only. 1740 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1741 std::pair<InstWidening, unsigned>>; 1742 1743 DecisionList WideningDecisions; 1744 1745 /// Returns true if \p V is expected to be vectorized and it needs to be 1746 /// extracted. 1747 bool needsExtract(Value *V, ElementCount VF) const { 1748 Instruction *I = dyn_cast<Instruction>(V); 1749 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1750 TheLoop->isLoopInvariant(I)) 1751 return false; 1752 1753 // Assume we can vectorize V (and hence we need extraction) if the 1754 // scalars are not computed yet. This can happen, because it is called 1755 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1756 // the scalars are collected. That should be a safe assumption in most 1757 // cases, because we check if the operands have vectorizable types 1758 // beforehand in LoopVectorizationLegality. 1759 return Scalars.find(VF) == Scalars.end() || 1760 !isScalarAfterVectorization(I, VF); 1761 }; 1762 1763 /// Returns a range containing only operands needing to be extracted. 
1764 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1765 ElementCount VF) { 1766 return SmallVector<Value *, 4>(make_filter_range( 1767 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1768 } 1769 1770 /// Determines if we have the infrastructure to vectorize loop \p L and its 1771 /// epilogue, assuming the main loop is vectorized by \p VF. 1772 bool isCandidateForEpilogueVectorization(const Loop &L, 1773 const ElementCount VF) const; 1774 1775 /// Returns true if epilogue vectorization is considered profitable, and 1776 /// false otherwise. 1777 /// \p VF is the vectorization factor chosen for the original loop. 1778 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1779 1780 public: 1781 /// The loop that we evaluate. 1782 Loop *TheLoop; 1783 1784 /// Predicated scalar evolution analysis. 1785 PredicatedScalarEvolution &PSE; 1786 1787 /// Loop Info analysis. 1788 LoopInfo *LI; 1789 1790 /// Vectorization legality. 1791 LoopVectorizationLegality *Legal; 1792 1793 /// Vector target information. 1794 const TargetTransformInfo &TTI; 1795 1796 /// Target Library Info. 1797 const TargetLibraryInfo *TLI; 1798 1799 /// Demanded bits analysis. 1800 DemandedBits *DB; 1801 1802 /// Assumption cache. 1803 AssumptionCache *AC; 1804 1805 /// Interface to emit optimization remarks. 1806 OptimizationRemarkEmitter *ORE; 1807 1808 const Function *TheFunction; 1809 1810 /// Loop Vectorize Hint. 1811 const LoopVectorizeHints *Hints; 1812 1813 /// The interleave access information contains groups of interleaved accesses 1814 /// with the same stride and close to each other. 1815 InterleavedAccessInfo &InterleaveInfo; 1816 1817 /// Values to ignore in the cost model. 1818 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1819 1820 /// Values to ignore in the cost model when VF > 1. 1821 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1822 1823 /// Profitable vector factors. 1824 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1825 }; 1826 1827 } // end namespace llvm 1828 1829 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1830 // vectorization. The loop needs to be annotated with #pragma omp simd 1831 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1832 // vector length information is not provided, vectorization is not considered 1833 // explicit. Interleave hints are not allowed either. These limitations will be 1834 // relaxed in the future. 1835 // Please, note that we are currently forced to abuse the pragma 'clang 1836 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1837 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1838 // provides *explicit vectorization hints* (LV can bypass legal checks and 1839 // assume that vectorization is legal). However, both hints are implemented 1840 // using the same metadata (llvm.loop.vectorize, processed by 1841 // LoopVectorizeHints). This will be fixed in the future when the native IR 1842 // representation for pragma 'omp simd' is introduced. 1843 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1844 OptimizationRemarkEmitter *ORE) { 1845 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 1846 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1847 1848 // Only outer loops with an explicit vectorization hint are supported. 1849 // Unannotated outer loops are ignored. 
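  // Illustrative sketch (not from the original source): an outer loop
  // annotated roughly as below would be a candidate here, assuming the
  // VPlan-native path is enabled; the width of 4 is just an example value.
  //
  //   #pragma clang loop vectorize(enable) vectorize_width(4)
  //   for (int i = 0; i < N; ++i)     // explicitly annotated outer loop
  //     for (int j = 0; j < M; ++j)
  //       A[i][j] += B[i][j];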
1850 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1851 return false; 1852 1853 Function *Fn = OuterLp->getHeader()->getParent(); 1854 if (!Hints.allowVectorization(Fn, OuterLp, 1855 true /*VectorizeOnlyWhenForced*/)) { 1856 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1857 return false; 1858 } 1859 1860 if (Hints.getInterleave() > 1) { 1861 // TODO: Interleave support is future work. 1862 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1863 "outer loops.\n"); 1864 Hints.emitRemarkWithHints(); 1865 return false; 1866 } 1867 1868 return true; 1869 } 1870 1871 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1872 OptimizationRemarkEmitter *ORE, 1873 SmallVectorImpl<Loop *> &V) { 1874 // Collect inner loops and outer loops without irreducible control flow. For 1875 // now, only collect outer loops that have explicit vectorization hints. If we 1876 // are stress testing the VPlan H-CFG construction, we collect the outermost 1877 // loop of every loop nest. 1878 if (L.isInnermost() || VPlanBuildStressTest || 1879 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1880 LoopBlocksRPO RPOT(&L); 1881 RPOT.perform(LI); 1882 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1883 V.push_back(&L); 1884 // TODO: Collect inner loops inside marked outer loops in case 1885 // vectorization fails for the outer loop. Do not invoke 1886 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1887 // already known to be reducible. We can use an inherited attribute for 1888 // that. 1889 return; 1890 } 1891 } 1892 for (Loop *InnerL : L) 1893 collectSupportedLoops(*InnerL, LI, ORE, V); 1894 } 1895 1896 namespace { 1897 1898 /// The LoopVectorize Pass. 1899 struct LoopVectorize : public FunctionPass { 1900 /// Pass identification, replacement for typeid 1901 static char ID; 1902 1903 LoopVectorizePass Impl; 1904 1905 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1906 bool VectorizeOnlyWhenForced = false) 1907 : FunctionPass(ID), 1908 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1909 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1910 } 1911 1912 bool runOnFunction(Function &F) override { 1913 if (skipFunction(F)) 1914 return false; 1915 1916 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1917 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1918 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1919 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1920 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1921 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1922 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 1923 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1924 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1925 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1926 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1927 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1928 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1929 1930 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1931 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1932 1933 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1934 GetLAA, *ORE, PSI).MadeAnyChange; 1935 } 1936 1937 void getAnalysisUsage(AnalysisUsage &AU) const override { 1938 AU.addRequired<AssumptionCacheTracker>(); 1939 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1940 AU.addRequired<DominatorTreeWrapperPass>(); 1941 AU.addRequired<LoopInfoWrapperPass>(); 1942 AU.addRequired<ScalarEvolutionWrapperPass>(); 1943 AU.addRequired<TargetTransformInfoWrapperPass>(); 1944 AU.addRequired<AAResultsWrapperPass>(); 1945 AU.addRequired<LoopAccessLegacyAnalysis>(); 1946 AU.addRequired<DemandedBitsWrapperPass>(); 1947 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1948 AU.addRequired<InjectTLIMappingsLegacy>(); 1949 1950 // We currently do not preserve loopinfo/dominator analyses with outer loop 1951 // vectorization. Until this is addressed, mark these analyses as preserved 1952 // only for non-VPlan-native path. 1953 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1954 if (!EnableVPlanNativePath) { 1955 AU.addPreserved<LoopInfoWrapperPass>(); 1956 AU.addPreserved<DominatorTreeWrapperPass>(); 1957 } 1958 1959 AU.addPreserved<BasicAAWrapperPass>(); 1960 AU.addPreserved<GlobalsAAWrapperPass>(); 1961 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1962 } 1963 }; 1964 1965 } // end anonymous namespace 1966 1967 //===----------------------------------------------------------------------===// 1968 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1969 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1970 //===----------------------------------------------------------------------===// 1971 1972 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1973 // We need to place the broadcast of invariant variables outside the loop, 1974 // but only if it's proven safe to do so. Else, broadcast will be inside 1975 // vector loop body. 1976 Instruction *Instr = dyn_cast<Instruction>(V); 1977 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1978 (!Instr || 1979 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1980 // Place the code for broadcasting invariant variables in the new preheader. 1981 IRBuilder<>::InsertPointGuard Guard(Builder); 1982 if (SafeToHoist) 1983 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1984 1985 // Broadcast the scalar into all locations in the vector. 
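  // Illustrative sketch (not part of the original code): for a fixed VF of 4
  // and an i32 value %v, the splat created below is lowered to roughly
  //   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> undef, <4 x i32> zeroinitializer
  // i.e. one insertelement followed by a zero-mask shufflevector.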
1986 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1987 1988 return Shuf; 1989 } 1990 1991 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1992 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1993 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1994 "Expected either an induction phi-node or a truncate of it!"); 1995 Value *Start = II.getStartValue(); 1996 1997 // Construct the initial value of the vector IV in the vector loop preheader 1998 auto CurrIP = Builder.saveIP(); 1999 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2000 if (isa<TruncInst>(EntryVal)) { 2001 assert(Start->getType()->isIntegerTy() && 2002 "Truncation requires an integer type"); 2003 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2004 Step = Builder.CreateTrunc(Step, TruncType); 2005 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2006 } 2007 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2008 Value *SteppedStart = 2009 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2010 2011 // We create vector phi nodes for both integer and floating-point induction 2012 // variables. Here, we determine the kind of arithmetic we will perform. 2013 Instruction::BinaryOps AddOp; 2014 Instruction::BinaryOps MulOp; 2015 if (Step->getType()->isIntegerTy()) { 2016 AddOp = Instruction::Add; 2017 MulOp = Instruction::Mul; 2018 } else { 2019 AddOp = II.getInductionOpcode(); 2020 MulOp = Instruction::FMul; 2021 } 2022 2023 // Multiply the vectorization factor by the step using integer or 2024 // floating-point arithmetic as appropriate. 2025 Value *ConstVF = 2026 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2027 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 2028 2029 // Create a vector splat to use in the induction update. 2030 // 2031 // FIXME: If the step is non-constant, we create the vector splat with 2032 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2033 // handle a constant vector splat. 2034 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2035 Value *SplatVF = isa<Constant>(Mul) 2036 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2037 : Builder.CreateVectorSplat(VF, Mul); 2038 Builder.restoreIP(CurrIP); 2039 2040 // We may need to add the step a number of times, depending on the unroll 2041 // factor. The last of those goes into the PHI. 2042 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2043 &*LoopVectorBody->getFirstInsertionPt()); 2044 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2045 Instruction *LastInduction = VecInd; 2046 for (unsigned Part = 0; Part < UF; ++Part) { 2047 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 2048 2049 if (isa<TruncInst>(EntryVal)) 2050 addMetadata(LastInduction, EntryVal); 2051 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 2052 2053 LastInduction = cast<Instruction>(addFastMathFlag( 2054 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 2055 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2056 } 2057 2058 // Move the last step to the end of the latch block. This ensures consistent 2059 // placement of all induction updates. 
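  // Illustrative sketch (not from the original source): for UF = 2 the loop
  // above leaves an IR shape roughly along the lines of
  //   vector.body:
  //     %vec.ind      = phi [ %stepped.start, %preheader ], [ %vec.ind.next, %latch ]
  //     %step.add     = add %vec.ind, %splat.vf.step    ; value used by part 1
  //     ...
  //     %vec.ind.next = add %step.add, %splat.vf.step   ; moved before the latch cmp below
  // where part 0 uses %vec.ind and part 1 uses %step.add.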
2060 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2061 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2062 auto *ICmp = cast<Instruction>(Br->getCondition()); 2063 LastInduction->moveBefore(ICmp); 2064 LastInduction->setName("vec.ind.next"); 2065 2066 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2067 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2068 } 2069 2070 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2071 return Cost->isScalarAfterVectorization(I, VF) || 2072 Cost->isProfitableToScalarize(I, VF); 2073 } 2074 2075 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2076 if (shouldScalarizeInstruction(IV)) 2077 return true; 2078 auto isScalarInst = [&](User *U) -> bool { 2079 auto *I = cast<Instruction>(U); 2080 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2081 }; 2082 return llvm::any_of(IV->users(), isScalarInst); 2083 } 2084 2085 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2086 const InductionDescriptor &ID, const Instruction *EntryVal, 2087 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 2088 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2089 "Expected either an induction phi-node or a truncate of it!"); 2090 2091 // This induction variable is not the phi from the original loop but the 2092 // newly-created IV based on the proof that casted Phi is equal to the 2093 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2094 // re-uses the same InductionDescriptor that original IV uses but we don't 2095 // have to do any recording in this case - that is done when original IV is 2096 // processed. 2097 if (isa<TruncInst>(EntryVal)) 2098 return; 2099 2100 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2101 if (Casts.empty()) 2102 return; 2103 // Only the first Cast instruction in the Casts vector is of interest. 2104 // The rest of the Casts (if exist) have no uses outside the 2105 // induction update chain itself. 2106 Instruction *CastInst = *Casts.begin(); 2107 if (Lane < UINT_MAX) 2108 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 2109 else 2110 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 2111 } 2112 2113 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 2114 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2115 "Primary induction variable must have an integer type"); 2116 2117 auto II = Legal->getInductionVars().find(IV); 2118 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2119 2120 auto ID = II->second; 2121 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2122 2123 // The value from the original loop to which we are mapping the new induction 2124 // variable. 2125 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2126 2127 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2128 2129 // Generate code for the induction step. 
Note that induction steps are 2130 // required to be loop-invariant 2131 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2132 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2133 "Induction step should be loop invariant"); 2134 if (PSE.getSE()->isSCEVable(IV->getType())) { 2135 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2136 return Exp.expandCodeFor(Step, Step->getType(), 2137 LoopVectorPreHeader->getTerminator()); 2138 } 2139 return cast<SCEVUnknown>(Step)->getValue(); 2140 }; 2141 2142 // The scalar value to broadcast. This is derived from the canonical 2143 // induction variable. If a truncation type is given, truncate the canonical 2144 // induction variable and step. Otherwise, derive these values from the 2145 // induction descriptor. 2146 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2147 Value *ScalarIV = Induction; 2148 if (IV != OldInduction) { 2149 ScalarIV = IV->getType()->isIntegerTy() 2150 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2151 : Builder.CreateCast(Instruction::SIToFP, Induction, 2152 IV->getType()); 2153 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2154 ScalarIV->setName("offset.idx"); 2155 } 2156 if (Trunc) { 2157 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2158 assert(Step->getType()->isIntegerTy() && 2159 "Truncation requires an integer step"); 2160 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2161 Step = Builder.CreateTrunc(Step, TruncType); 2162 } 2163 return ScalarIV; 2164 }; 2165 2166 // Create the vector values from the scalar IV, in the absence of creating a 2167 // vector IV. 2168 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2169 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2170 for (unsigned Part = 0; Part < UF; ++Part) { 2171 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2172 Value *EntryPart = 2173 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2174 ID.getInductionOpcode()); 2175 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 2176 if (Trunc) 2177 addMetadata(EntryPart, Trunc); 2178 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 2179 } 2180 }; 2181 2182 // Now do the actual transformations, and start with creating the step value. 2183 Value *Step = CreateStepValue(ID.getStep()); 2184 if (VF.isZero() || VF.isScalar()) { 2185 Value *ScalarIV = CreateScalarIV(Step); 2186 CreateSplatIV(ScalarIV, Step); 2187 return; 2188 } 2189 2190 // Determine if we want a scalar version of the induction variable. This is 2191 // true if the induction variable itself is not widened, or if it has at 2192 // least one user in the loop that is not widened. 2193 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2194 if (!NeedsScalarIV) { 2195 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2196 return; 2197 } 2198 2199 // Try to create a new independent vector induction variable. If we can't 2200 // create the phi node, we will splat the scalar induction variable in each 2201 // loop iteration. 2202 if (!shouldScalarizeInstruction(EntryVal)) { 2203 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2204 Value *ScalarIV = CreateScalarIV(Step); 2205 // Create scalar steps that can be used by instructions we will later 2206 // scalarize. Note that the addition of the scalar steps will not increase 2207 // the number of instructions in the loop in the common case prior to 2208 // InstCombine. We will be trading one vector extract for each scalar step. 
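    // Illustrative sketch (not from the original source): for VF = 4 and
    // UF = 1, buildScalarSteps below produces per-lane values such as
    //   lane 0: %offset.idx + 0 * %step
    //   lane 1: %offset.idx + 1 * %step
    //   lane 2: %offset.idx + 2 * %step
    //   lane 3: %offset.idx + 3 * %step
    // which scalar users consume directly instead of extracting lanes from
    // the vector IV created above.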
2209 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2210 return; 2211 } 2212 2213 // All IV users are scalar instructions, so only emit a scalar IV, not a 2214 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2215 // predicate used by the masked loads/stores. 2216 Value *ScalarIV = CreateScalarIV(Step); 2217 if (!Cost->isScalarEpilogueAllowed()) 2218 CreateSplatIV(ScalarIV, Step); 2219 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2220 } 2221 2222 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2223 Instruction::BinaryOps BinOp) { 2224 // Create and check the types. 2225 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2226 int VLen = ValVTy->getNumElements(); 2227 2228 Type *STy = Val->getType()->getScalarType(); 2229 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2230 "Induction Step must be an integer or FP"); 2231 assert(Step->getType() == STy && "Step has wrong type"); 2232 2233 SmallVector<Constant *, 8> Indices; 2234 2235 if (STy->isIntegerTy()) { 2236 // Create a vector of consecutive numbers from zero to VF. 2237 for (int i = 0; i < VLen; ++i) 2238 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2239 2240 // Add the consecutive indices to the vector value. 2241 Constant *Cv = ConstantVector::get(Indices); 2242 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2243 Step = Builder.CreateVectorSplat(VLen, Step); 2244 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2245 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2246 // which can be found from the original scalar operations. 2247 Step = Builder.CreateMul(Cv, Step); 2248 return Builder.CreateAdd(Val, Step, "induction"); 2249 } 2250 2251 // Floating point induction. 2252 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2253 "Binary Opcode should be specified for FP induction"); 2254 // Create a vector of consecutive numbers from zero to VF. 2255 for (int i = 0; i < VLen; ++i) 2256 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2257 2258 // Add the consecutive indices to the vector value. 2259 Constant *Cv = ConstantVector::get(Indices); 2260 2261 Step = Builder.CreateVectorSplat(VLen, Step); 2262 2263 // Floating point operations had to be 'fast' to enable the induction. 2264 FastMathFlags Flags; 2265 Flags.setFast(); 2266 2267 Value *MulOp = Builder.CreateFMul(Cv, Step); 2268 if (isa<Instruction>(MulOp)) 2269 // Have to check, MulOp may be a constant 2270 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2271 2272 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2273 if (isa<Instruction>(BOp)) 2274 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2275 return BOp; 2276 } 2277 2278 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2279 Instruction *EntryVal, 2280 const InductionDescriptor &ID) { 2281 // We shouldn't have to build scalar steps if we aren't vectorizing. 2282 assert(VF.isVector() && "VF should be greater than one"); 2283 assert(!VF.isScalable() && 2284 "the code below assumes a fixed number of elements at compile time"); 2285 // Get the value type and ensure it and the step have the same integer type. 2286 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2287 assert(ScalarIVTy == Step->getType() && 2288 "Val and Step should have the same type"); 2289 2290 // We build scalar steps for both integer and floating-point induction 2291 // variables. 
Here, we determine the kind of arithmetic we will perform. 2292 Instruction::BinaryOps AddOp; 2293 Instruction::BinaryOps MulOp; 2294 if (ScalarIVTy->isIntegerTy()) { 2295 AddOp = Instruction::Add; 2296 MulOp = Instruction::Mul; 2297 } else { 2298 AddOp = ID.getInductionOpcode(); 2299 MulOp = Instruction::FMul; 2300 } 2301 2302 // Determine the number of scalars we need to generate for each unroll 2303 // iteration. If EntryVal is uniform, we only need to generate the first 2304 // lane. Otherwise, we generate all VF values. 2305 unsigned Lanes = 2306 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2307 ? 1 2308 : VF.getKnownMinValue(); 2309 // Compute the scalar steps and save the results in VectorLoopValueMap. 2310 for (unsigned Part = 0; Part < UF; ++Part) { 2311 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2312 auto *StartIdx = getSignedIntOrFpConstant( 2313 ScalarIVTy, VF.getKnownMinValue() * Part + Lane); 2314 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2315 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2316 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2317 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2318 } 2319 } 2320 } 2321 2322 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2323 assert(V != Induction && "The new induction variable should not be used."); 2324 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2325 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2326 2327 // If we have a stride that is replaced by one, do it here. Defer this for 2328 // the VPlan-native path until we start running Legal checks in that path. 2329 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2330 V = ConstantInt::get(V->getType(), 1); 2331 2332 // If we have a vector mapped to this value, return it. 2333 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2334 return VectorLoopValueMap.getVectorValue(V, Part); 2335 2336 // If the value has not been vectorized, check if it has been scalarized 2337 // instead. If it has been scalarized, and we actually need the value in 2338 // vector form, we will construct the vector values on demand. 2339 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2340 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2341 2342 // If we've scalarized a value, that value should be an instruction. 2343 auto *I = cast<Instruction>(V); 2344 2345 // If we aren't vectorizing, we can just copy the scalar map values over to 2346 // the vector map. 2347 if (VF.isScalar()) { 2348 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2349 return ScalarValue; 2350 } 2351 2352 // Get the last scalar instruction we generated for V and Part. If the value 2353 // is known to be uniform after vectorization, this corresponds to lane zero 2354 // of the Part unroll iteration. Otherwise, the last instruction is the one 2355 // we created for the last vector lane of the Part unroll iteration. 2356 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2357 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2358 ? 0 2359 : VF.getKnownMinValue() - 1; 2360 auto *LastInst = cast<Instruction>( 2361 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2362 2363 // Set the insert point after the last scalarized instruction. This ensures 2364 // the insertelement sequence will directly follow the scalar definitions. 
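    // Illustrative sketch (not from the original source): for VF = 4 the
    // packing done below via packScalarIntoVectorValue builds something like
    //   %p0 = insertelement <4 x i32> undef, i32 %s0, i32 0
    //   %p1 = insertelement <4 x i32> %p0,   i32 %s1, i32 1
    //   %p2 = insertelement <4 x i32> %p1,   i32 %s2, i32 2
    //   %p3 = insertelement <4 x i32> %p2,   i32 %s3, i32 3
    // placed immediately after the last scalar definition %s3.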
2365 auto OldIP = Builder.saveIP(); 2366 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2367 Builder.SetInsertPoint(&*NewIP); 2368 2369 // However, if we are vectorizing, we need to construct the vector values. 2370 // If the value is known to be uniform after vectorization, we can just 2371 // broadcast the scalar value corresponding to lane zero for each unroll 2372 // iteration. Otherwise, we construct the vector values using insertelement 2373 // instructions. Since the resulting vectors are stored in 2374 // VectorLoopValueMap, we will only generate the insertelements once. 2375 Value *VectorValue = nullptr; 2376 if (Cost->isUniformAfterVectorization(I, VF)) { 2377 VectorValue = getBroadcastInstrs(ScalarValue); 2378 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2379 } else { 2380 // Initialize packing with insertelements to start from undef. 2381 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2382 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2383 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2384 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2385 packScalarIntoVectorValue(V, {Part, Lane}); 2386 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2387 } 2388 Builder.restoreIP(OldIP); 2389 return VectorValue; 2390 } 2391 2392 // If this scalar is unknown, assume that it is a constant or that it is 2393 // loop invariant. Broadcast V and save the value for future uses. 2394 Value *B = getBroadcastInstrs(V); 2395 VectorLoopValueMap.setVectorValue(V, Part, B); 2396 return B; 2397 } 2398 2399 Value * 2400 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2401 const VPIteration &Instance) { 2402 // If the value is not an instruction contained in the loop, it should 2403 // already be scalar. 2404 if (OrigLoop->isLoopInvariant(V)) 2405 return V; 2406 2407 assert(Instance.Lane > 0 2408 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2409 : true && "Uniform values only have lane zero"); 2410 2411 // If the value from the original loop has not been vectorized, it is 2412 // represented by UF x VF scalar values in the new loop. Return the requested 2413 // scalar value. 2414 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2415 return VectorLoopValueMap.getScalarValue(V, Instance); 2416 2417 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2418 // for the given unroll part. If this entry is not a vector type (i.e., the 2419 // vectorization factor is one), there is no need to generate an 2420 // extractelement instruction. 2421 auto *U = getOrCreateVectorValue(V, Instance.Part); 2422 if (!U->getType()->isVectorTy()) { 2423 assert(VF.isScalar() && "Value not scalarized has non-vector type"); 2424 return U; 2425 } 2426 2427 // Otherwise, the value from the original loop has been vectorized and is 2428 // represented by UF vector values. Extract and return the requested scalar 2429 // value from the appropriate vector lane. 
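  // Illustrative note (not from the original source): for Instance.Lane == 2
  // and a <4 x i32> unroll part, the extract below is simply
  //   %scalar = extractelement <4 x i32> %wide.part, i32 2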
2430 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2431 } 2432 2433 void InnerLoopVectorizer::packScalarIntoVectorValue( 2434 Value *V, const VPIteration &Instance) { 2435 assert(V != Induction && "The new induction variable should not be used."); 2436 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2437 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2438 2439 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2440 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2441 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2442 Builder.getInt32(Instance.Lane)); 2443 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2444 } 2445 2446 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2447 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2448 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2449 SmallVector<int, 8> ShuffleMask; 2450 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2451 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2452 2453 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); 2454 } 2455 2456 // Return whether we allow using masked interleave-groups (for dealing with 2457 // strided loads/stores that reside in predicated blocks, or for dealing 2458 // with gaps). 2459 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2460 // If an override option has been passed in for interleaved accesses, use it. 2461 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2462 return EnableMaskedInterleavedMemAccesses; 2463 2464 return TTI.enableMaskedInterleavedAccessVectorization(); 2465 } 2466 2467 // Try to vectorize the interleave group that \p Instr belongs to. 2468 // 2469 // E.g. Translate following interleaved load group (factor = 3): 2470 // for (i = 0; i < N; i+=3) { 2471 // R = Pic[i]; // Member of index 0 2472 // G = Pic[i+1]; // Member of index 1 2473 // B = Pic[i+2]; // Member of index 2 2474 // ... // do something to R, G, B 2475 // } 2476 // To: 2477 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2478 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2479 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2480 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2481 // 2482 // Or translate following interleaved store group (factor = 3): 2483 // for (i = 0; i < N; i+=3) { 2484 // ... do something to R, G, B 2485 // Pic[i] = R; // Member of index 0 2486 // Pic[i+1] = G; // Member of index 1 2487 // Pic[i+2] = B; // Member of index 2 2488 // } 2489 // To: 2490 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2491 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2492 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2493 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2494 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2495 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2496 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2497 VPValue *Addr, ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask) { 2498 Instruction *Instr = Group->getInsertPos(); 2499 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2500 2501 // Prepare for the vector type of the interleaved load/store. 
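  // Illustrative note (not from the original source): for VF = 4, an
  // interleave factor of 3 and i32 elements, VecTy below becomes <12 x i32>,
  // matching the R,G,B example in the comment above this function.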
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  assert(!VF.isScalable() &&
         "scalable vector reverse operation is not implemented");
  if (Group->isReverse())
    Index += (VF.getKnownMinValue() - 1) * Group->getFactor();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, {Part, 0});
    setDebugLocFromInst(Builder, AddrPart);

    // Note that the current instruction could be a member at any index of the
    // group. We need to adjust the address to that of the member at index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *UndefVec = UndefValue::get(VecTy);

  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    assert(!VF.isScalable() && "scalable vectors not yet supported.");
    MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
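    // Illustrative note (not from the original source): when a block mask is
    // present, createReplicatedMask(3, 4) used below expands the 4 per-iteration
    // predicate lanes to the 12 interleaved lanes, i.e.
    //   <0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3>
    // so every member of a tuple inherits the predicate of its iteration.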
2567 SmallVector<Value *, 2> NewLoads; 2568 for (unsigned Part = 0; Part < UF; Part++) { 2569 Instruction *NewLoad; 2570 if (BlockInMask || MaskForGaps) { 2571 assert(useMaskedInterleavedAccesses(*TTI) && 2572 "masked interleaved groups are not allowed."); 2573 Value *GroupMask = MaskForGaps; 2574 if (BlockInMask) { 2575 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2576 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2577 Value *ShuffledMask = Builder.CreateShuffleVector( 2578 BlockInMaskPart, 2579 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2580 "interleaved.mask"); 2581 GroupMask = MaskForGaps 2582 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2583 MaskForGaps) 2584 : ShuffledMask; 2585 } 2586 NewLoad = 2587 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2588 GroupMask, UndefVec, "wide.masked.vec"); 2589 } 2590 else 2591 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2592 Group->getAlign(), "wide.vec"); 2593 Group->addMetadata(NewLoad); 2594 NewLoads.push_back(NewLoad); 2595 } 2596 2597 // For each member in the group, shuffle out the appropriate data from the 2598 // wide loads. 2599 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2600 Instruction *Member = Group->getMember(I); 2601 2602 // Skip the gaps in the group. 2603 if (!Member) 2604 continue; 2605 2606 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2607 auto StrideMask = 2608 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2609 for (unsigned Part = 0; Part < UF; Part++) { 2610 Value *StridedVec = Builder.CreateShuffleVector( 2611 NewLoads[Part], StrideMask, "strided.vec"); 2612 2613 // If this member has different type, cast the result type. 2614 if (Member->getType() != ScalarTy) { 2615 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2616 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2617 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2618 } 2619 2620 if (Group->isReverse()) 2621 StridedVec = reverseVector(StridedVec); 2622 2623 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2624 } 2625 } 2626 return; 2627 } 2628 2629 // The sub vector type for current instruction. 2630 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2631 auto *SubVT = VectorType::get(ScalarTy, VF); 2632 2633 // Vectorize the interleaved store group. 2634 for (unsigned Part = 0; Part < UF; Part++) { 2635 // Collect the stored vector from each member. 2636 SmallVector<Value *, 4> StoredVecs; 2637 for (unsigned i = 0; i < InterleaveFactor; i++) { 2638 // Interleaved store group doesn't allow a gap, so each index has a member 2639 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2640 2641 Value *StoredVec = State.get(StoredValues[i], Part); 2642 2643 if (Group->isReverse()) 2644 StoredVec = reverseVector(StoredVec); 2645 2646 // If this member has different type, cast it to a unified type. 2647 2648 if (StoredVec->getType() != SubVT) 2649 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2650 2651 StoredVecs.push_back(StoredVec); 2652 } 2653 2654 // Concatenate all vectors into a wide vector. 2655 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2656 2657 // Interleave the elements in the wide vector. 
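    // Illustrative note (not from the original source): for VF = 4 and an
    // interleave factor of 3, createInterleaveMask(4, 3) below yields
    //   <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
    // which is exactly the store shuffle shown in the example comment above
    // this function.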
2658 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2659 Value *IVec = Builder.CreateShuffleVector( 2660 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2661 "interleaved.vec"); 2662 2663 Instruction *NewStoreInstr; 2664 if (BlockInMask) { 2665 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2666 Value *ShuffledMask = Builder.CreateShuffleVector( 2667 BlockInMaskPart, 2668 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2669 "interleaved.mask"); 2670 NewStoreInstr = Builder.CreateMaskedStore( 2671 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2672 } 2673 else 2674 NewStoreInstr = 2675 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2676 2677 Group->addMetadata(NewStoreInstr); 2678 } 2679 } 2680 2681 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2682 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2683 VPValue *StoredValue, VPValue *BlockInMask) { 2684 // Attempt to issue a wide load. 2685 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2686 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2687 2688 assert((LI || SI) && "Invalid Load/Store instruction"); 2689 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2690 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2691 2692 LoopVectorizationCostModel::InstWidening Decision = 2693 Cost->getWideningDecision(Instr, VF); 2694 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2695 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2696 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2697 "CM decision is not to widen the memory instruction"); 2698 2699 Type *ScalarDataTy = getMemInstValueType(Instr); 2700 2701 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2702 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2703 const Align Alignment = getLoadStoreAlignment(Instr); 2704 2705 // Determine if the pointer operand of the access is either consecutive or 2706 // reverse consecutive. 2707 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2708 bool ConsecutiveStride = 2709 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2710 bool CreateGatherScatter = 2711 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2712 2713 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2714 // gather/scatter. Otherwise Decision should have been to Scalarize. 2715 assert((ConsecutiveStride || CreateGatherScatter) && 2716 "The instruction should be scalarized"); 2717 (void)ConsecutiveStride; 2718 2719 VectorParts BlockInMaskParts(UF); 2720 bool isMaskRequired = BlockInMask; 2721 if (isMaskRequired) 2722 for (unsigned Part = 0; Part < UF; ++Part) 2723 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2724 2725 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2726 // Calculate the pointer for the specific unroll-part. 2727 GetElementPtrInst *PartPtr = nullptr; 2728 2729 bool InBounds = false; 2730 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2731 InBounds = gep->isInBounds(); 2732 2733 if (Reverse) { 2734 // If the address is consecutive but reversed, then the 2735 // wide store needs to start at the last vector element. 
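      // For example (illustrative only; VF = 4), part 0 is rebased with GEP
      // offsets 0 and then 1 - 4 = -3, so its wide access covers elements
      // [-3, 0] of Ptr; part 1 uses offsets -4 and then -3, covering [-7, -4].
      // The loaded or stored vector itself is reversed separately.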
2736 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2737 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2738 PartPtr->setIsInBounds(InBounds); 2739 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2740 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2741 PartPtr->setIsInBounds(InBounds); 2742 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2743 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2744 } else { 2745 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2746 ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue()))); 2747 PartPtr->setIsInBounds(InBounds); 2748 } 2749 2750 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2751 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2752 }; 2753 2754 // Handle Stores: 2755 if (SI) { 2756 setDebugLocFromInst(Builder, SI); 2757 2758 for (unsigned Part = 0; Part < UF; ++Part) { 2759 Instruction *NewSI = nullptr; 2760 Value *StoredVal = State.get(StoredValue, Part); 2761 if (CreateGatherScatter) { 2762 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2763 Value *VectorGep = State.get(Addr, Part); 2764 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2765 MaskPart); 2766 } else { 2767 if (Reverse) { 2768 // If we store to reverse consecutive memory locations, then we need 2769 // to reverse the order of elements in the stored value. 2770 StoredVal = reverseVector(StoredVal); 2771 // We don't want to update the value in the map as it might be used in 2772 // another expression. So don't call resetVectorValue(StoredVal). 2773 } 2774 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2775 if (isMaskRequired) 2776 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2777 BlockInMaskParts[Part]); 2778 else 2779 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2780 } 2781 addMetadata(NewSI, SI); 2782 } 2783 return; 2784 } 2785 2786 // Handle loads. 2787 assert(LI && "Must have a load instruction"); 2788 setDebugLocFromInst(Builder, LI); 2789 for (unsigned Part = 0; Part < UF; ++Part) { 2790 Value *NewLI; 2791 if (CreateGatherScatter) { 2792 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2793 Value *VectorGep = State.get(Addr, Part); 2794 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2795 nullptr, "wide.masked.gather"); 2796 addMetadata(NewLI, LI); 2797 } else { 2798 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2799 if (isMaskRequired) 2800 NewLI = Builder.CreateMaskedLoad( 2801 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2802 "wide.masked.load"); 2803 else 2804 NewLI = 2805 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2806 2807 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2808 addMetadata(NewLI, LI); 2809 if (Reverse) 2810 NewLI = reverseVector(NewLI); 2811 } 2812 2813 State.set(Def, Instr, NewLI, Part); 2814 } 2815 } 2816 2817 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2818 const VPIteration &Instance, 2819 bool IfPredicateInstr, 2820 VPTransformState &State) { 2821 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2822 2823 setDebugLocFromInst(Builder, Instr); 2824 2825 // Does this instruction return a value ? 
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
    auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
    auto InputInstance = Instance;
    if (!Operand || !OrigLoop->contains(Operand) ||
        (Cost->isUniformAfterVectorization(Operand, State.VF)))
      InputInstance.Lane = 0;
    auto *NewOp = State.get(User.getOperand(op), InputInstance);
    Cloned->setOperand(op, NewOp);
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  // TODO: Set result for VPValue of VPReplicateRecipe. This requires
  // representing scalar values in VPTransformState. Add the cloned scalar to
  // the scalar map entry.
  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);

  // If we just cloned a new assumption, add it to the assumption cache.
  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
    if (II->getIntrinsicID() == Intrinsic::assume)
      AC->registerAssumption(II);

  // End if-block.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}

PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(Builder, OldInst);
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  Builder.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(Builder, OldInst);

  // Create i+1 and fill the PHINode.
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}

Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare.
The only way that we get a backedge taken count is that the 2913 // induction variable was signed and as such will not overflow. In such a case 2914 // truncation is legal. 2915 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2916 IdxTy->getPrimitiveSizeInBits()) 2917 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2918 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2919 2920 // Get the total trip count from the count by adding 1. 2921 const SCEV *ExitCount = SE->getAddExpr( 2922 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2923 2924 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2925 2926 // Expand the trip count and place the new instructions in the preheader. 2927 // Notice that the pre-header does not change, only the loop body. 2928 SCEVExpander Exp(*SE, DL, "induction"); 2929 2930 // Count holds the overall loop count (N). 2931 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2932 L->getLoopPreheader()->getTerminator()); 2933 2934 if (TripCount->getType()->isPointerTy()) 2935 TripCount = 2936 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2937 L->getLoopPreheader()->getTerminator()); 2938 2939 return TripCount; 2940 } 2941 2942 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2943 if (VectorTripCount) 2944 return VectorTripCount; 2945 2946 Value *TC = getOrCreateTripCount(L); 2947 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2948 2949 Type *Ty = TC->getType(); 2950 // This is where we can make the step a runtime constant. 2951 assert(!VF.isScalable() && "scalable vectorization is not supported yet"); 2952 Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF); 2953 2954 // If the tail is to be folded by masking, round the number of iterations N 2955 // up to a multiple of Step instead of rounding down. This is done by first 2956 // adding Step-1 and then rounding down. Note that it's ok if this addition 2957 // overflows: the vector induction variable will eventually wrap to zero given 2958 // that it starts at zero and its Step is a power of two; the loop will then 2959 // exit, with the last early-exit vector comparison also producing all-true. 2960 if (Cost->foldTailByMasking()) { 2961 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2962 "VF*UF must be a power of 2 when folding tail by masking"); 2963 TC = Builder.CreateAdd( 2964 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 2965 } 2966 2967 // Now we need to generate the expression for the part of the loop that the 2968 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2969 // iterations are not required for correctness, or N - Step, otherwise. Step 2970 // is equal to the vectorization factor (number of SIMD elements) times the 2971 // unroll factor (number of SIMD instructions). 2972 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2973 2974 // If there is a non-reversed interleaved group that may speculatively access 2975 // memory out-of-bounds, we need to ensure that there will be at least one 2976 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2977 // the trip count, we set the remainder to be equal to the step. If the step 2978 // does not evenly divide the trip count, no adjustment is necessary since 2979 // there will already be scalar iterations. Note that the minimum iterations 2980 // check ensures that N >= Step. 
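  // For example (illustrative only), with a trip count of 8 and Step = 4 the
  // remainder R is 0; forcing R up to Step leaves a vector trip count of 4,
  // i.e. one vector iteration plus four scalar epilogue iterations.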
2981 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 2982 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2983 R = Builder.CreateSelect(IsZero, Step, R); 2984 } 2985 2986 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2987 2988 return VectorTripCount; 2989 } 2990 2991 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2992 const DataLayout &DL) { 2993 // Verify that V is a vector type with same number of elements as DstVTy. 2994 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2995 unsigned VF = DstFVTy->getNumElements(); 2996 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2997 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2998 Type *SrcElemTy = SrcVecTy->getElementType(); 2999 Type *DstElemTy = DstFVTy->getElementType(); 3000 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3001 "Vector elements must have same size"); 3002 3003 // Do a direct cast if element types are castable. 3004 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3005 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3006 } 3007 // V cannot be directly casted to desired vector type. 3008 // May happen when V is a floating point vector but DstVTy is a vector of 3009 // pointers or vice-versa. Handle this using a two-step bitcast using an 3010 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3011 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3012 "Only one type should be a pointer type"); 3013 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3014 "Only one type should be a floating point type"); 3015 Type *IntTy = 3016 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3017 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3018 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3019 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3020 } 3021 3022 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3023 BasicBlock *Bypass) { 3024 Value *Count = getOrCreateTripCount(L); 3025 // Reuse existing vector loop preheader for TC checks. 3026 // Note that new preheader block is generated for vector loop. 3027 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3028 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3029 3030 // Generate code to check if the loop's trip count is less than VF * UF, or 3031 // equal to it in case a scalar epilogue is required; this implies that the 3032 // vector trip count is zero. This check also covers the case where adding one 3033 // to the backedge-taken count overflowed leading to an incorrect trip count 3034 // of zero. In this case we will also jump to the scalar loop. 3035 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3036 : ICmpInst::ICMP_ULT; 3037 3038 // If tail is to be folded, vector loop takes care of all iterations. 3039 Value *CheckMinIters = Builder.getFalse(); 3040 if (!Cost->foldTailByMasking()) { 3041 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3042 CheckMinIters = Builder.CreateICmp( 3043 P, Count, 3044 ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF), 3045 "min.iters.check"); 3046 } 3047 // Create new preheader for vector loop. 
3048 LoopVectorPreHeader = 3049 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3050 "vector.ph"); 3051 3052 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3053 DT->getNode(Bypass)->getIDom()) && 3054 "TC check is expected to dominate Bypass"); 3055 3056 // Update dominator for Bypass & LoopExit. 3057 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3058 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3059 3060 ReplaceInstWithInst( 3061 TCCheckBlock->getTerminator(), 3062 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3063 LoopBypassBlocks.push_back(TCCheckBlock); 3064 } 3065 3066 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3067 // Reuse existing vector loop preheader for SCEV checks. 3068 // Note that new preheader block is generated for vector loop. 3069 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 3070 3071 // Generate the code to check that the SCEV assumptions that we made. 3072 // We want the new basic block to start at the first instruction in a 3073 // sequence of instructions that form a check. 3074 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 3075 "scev.check"); 3076 Value *SCEVCheck = Exp.expandCodeForPredicate( 3077 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 3078 3079 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 3080 if (C->isZero()) 3081 return; 3082 3083 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3084 (OptForSizeBasedOnProfile && 3085 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3086 "Cannot SCEV check stride or overflow when optimizing for size"); 3087 3088 SCEVCheckBlock->setName("vector.scevcheck"); 3089 // Create new preheader for vector loop. 3090 LoopVectorPreHeader = 3091 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 3092 nullptr, "vector.ph"); 3093 3094 // Update dominator only if this is first RT check. 3095 if (LoopBypassBlocks.empty()) { 3096 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3097 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3098 } 3099 3100 ReplaceInstWithInst( 3101 SCEVCheckBlock->getTerminator(), 3102 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 3103 LoopBypassBlocks.push_back(SCEVCheckBlock); 3104 AddedSafetyChecks = true; 3105 } 3106 3107 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 3108 // VPlan-native path does not do any analysis for runtime checks currently. 3109 if (EnableVPlanNativePath) 3110 return; 3111 3112 // Reuse existing vector loop preheader for runtime memory checks. 3113 // Note that new preheader block is generated for vector loop. 3114 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 3115 3116 // Generate the code that checks in runtime if arrays overlap. We put the 3117 // checks into a separate block to make the more common case of few elements 3118 // faster. 
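  // Conceptually (illustrative only), for two pointers A and B with access
  // ranges [AStart, AEnd) and [BStart, BEnd), the emitted check verifies that
  // the ranges do not overlap, e.g. (AEnd <= BStart) || (BEnd <= AStart); if
  // that does not hold, the branch created below takes the bypass edge to the
  // scalar loop.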
3119 auto *LAI = Legal->getLAI(); 3120 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 3121 if (!RtPtrChecking.Need) 3122 return; 3123 3124 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3125 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3126 "Cannot emit memory checks when optimizing for size, unless forced " 3127 "to vectorize."); 3128 ORE->emit([&]() { 3129 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3130 L->getStartLoc(), L->getHeader()) 3131 << "Code-size may be reduced by not forcing " 3132 "vectorization, or by source-code modifications " 3133 "eliminating the need for runtime checks " 3134 "(e.g., adding 'restrict')."; 3135 }); 3136 } 3137 3138 MemCheckBlock->setName("vector.memcheck"); 3139 // Create new preheader for vector loop. 3140 LoopVectorPreHeader = 3141 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 3142 "vector.ph"); 3143 3144 auto *CondBranch = cast<BranchInst>( 3145 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 3146 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 3147 LoopBypassBlocks.push_back(MemCheckBlock); 3148 AddedSafetyChecks = true; 3149 3150 // Update dominator only if this is first RT check. 3151 if (LoopBypassBlocks.empty()) { 3152 DT->changeImmediateDominator(Bypass, MemCheckBlock); 3153 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 3154 } 3155 3156 Instruction *FirstCheckInst; 3157 Instruction *MemRuntimeCheck; 3158 std::tie(FirstCheckInst, MemRuntimeCheck) = 3159 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 3160 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 3161 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 3162 "claimed checks are required"); 3163 CondBranch->setCondition(MemRuntimeCheck); 3164 3165 // We currently don't use LoopVersioning for the actual loop cloning but we 3166 // still use it to add the noalias metadata. 3167 LVer = std::make_unique<LoopVersioning>( 3168 *Legal->getLAI(), 3169 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3170 DT, PSE.getSE()); 3171 LVer->prepareNoAliasMetadata(); 3172 } 3173 3174 Value *InnerLoopVectorizer::emitTransformedIndex( 3175 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3176 const InductionDescriptor &ID) const { 3177 3178 SCEVExpander Exp(*SE, DL, "induction"); 3179 auto Step = ID.getStep(); 3180 auto StartValue = ID.getStartValue(); 3181 assert(Index->getType() == Step->getType() && 3182 "Index type does not match StepValue type"); 3183 3184 // Note: the IR at this point is broken. We cannot use SE to create any new 3185 // SCEV and then expand it, hoping that SCEV's simplification will give us 3186 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3187 // lead to various SCEV crashes. So all we can do is to use builder and rely 3188 // on InstCombine for future simplifications. Here we handle some trivial 3189 // cases only. 
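  // The helper lambdas below fold the trivial identities X + 0 == X and
  // X * 1 == X directly, so that we do not have to rely on InstCombine to
  // remove the dead arithmetic later.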
3190 auto CreateAdd = [&B](Value *X, Value *Y) { 3191 assert(X->getType() == Y->getType() && "Types don't match!"); 3192 if (auto *CX = dyn_cast<ConstantInt>(X)) 3193 if (CX->isZero()) 3194 return Y; 3195 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3196 if (CY->isZero()) 3197 return X; 3198 return B.CreateAdd(X, Y); 3199 }; 3200 3201 auto CreateMul = [&B](Value *X, Value *Y) { 3202 assert(X->getType() == Y->getType() && "Types don't match!"); 3203 if (auto *CX = dyn_cast<ConstantInt>(X)) 3204 if (CX->isOne()) 3205 return Y; 3206 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3207 if (CY->isOne()) 3208 return X; 3209 return B.CreateMul(X, Y); 3210 }; 3211 3212 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3213 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3214 // the DomTree is not kept up-to-date for additional blocks generated in the 3215 // vector loop. By using the header as insertion point, we guarantee that the 3216 // expanded instructions dominate all their uses. 3217 auto GetInsertPoint = [this, &B]() { 3218 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3219 if (InsertBB != LoopVectorBody && 3220 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3221 return LoopVectorBody->getTerminator(); 3222 return &*B.GetInsertPoint(); 3223 }; 3224 switch (ID.getKind()) { 3225 case InductionDescriptor::IK_IntInduction: { 3226 assert(Index->getType() == StartValue->getType() && 3227 "Index type does not match StartValue type"); 3228 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3229 return B.CreateSub(StartValue, Index); 3230 auto *Offset = CreateMul( 3231 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3232 return CreateAdd(StartValue, Offset); 3233 } 3234 case InductionDescriptor::IK_PtrInduction: { 3235 assert(isa<SCEVConstant>(Step) && 3236 "Expected constant step for pointer induction"); 3237 return B.CreateGEP( 3238 StartValue->getType()->getPointerElementType(), StartValue, 3239 CreateMul(Index, 3240 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3241 } 3242 case InductionDescriptor::IK_FpInduction: { 3243 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3244 auto InductionBinOp = ID.getInductionBinOp(); 3245 assert(InductionBinOp && 3246 (InductionBinOp->getOpcode() == Instruction::FAdd || 3247 InductionBinOp->getOpcode() == Instruction::FSub) && 3248 "Original bin op should be defined for FP induction"); 3249 3250 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3251 3252 // Floating point operations had to be 'fast' to enable the induction. 3253 FastMathFlags Flags; 3254 Flags.setFast(); 3255 3256 Value *MulExp = B.CreateFMul(StepValue, Index); 3257 if (isa<Instruction>(MulExp)) 3258 // We have to check, the MulExp may be a constant. 
3259 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3260 3261 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3262 "induction"); 3263 if (isa<Instruction>(BOp)) 3264 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3265 3266 return BOp; 3267 } 3268 case InductionDescriptor::IK_NoInduction: 3269 return nullptr; 3270 } 3271 llvm_unreachable("invalid enum"); 3272 } 3273 3274 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3275 LoopScalarBody = OrigLoop->getHeader(); 3276 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3277 LoopExitBlock = OrigLoop->getExitBlock(); 3278 assert(LoopExitBlock && "Must have an exit block"); 3279 assert(LoopVectorPreHeader && "Invalid loop structure"); 3280 3281 LoopMiddleBlock = 3282 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3283 LI, nullptr, Twine(Prefix) + "middle.block"); 3284 LoopScalarPreHeader = 3285 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3286 nullptr, Twine(Prefix) + "scalar.ph"); 3287 // We intentionally don't let SplitBlock to update LoopInfo since 3288 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3289 // LoopVectorBody is explicitly added to the correct place few lines later. 3290 LoopVectorBody = 3291 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3292 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3293 3294 // Update dominator for loop exit. 3295 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3296 3297 // Create and register the new vector loop. 3298 Loop *Lp = LI->AllocateLoop(); 3299 Loop *ParentLoop = OrigLoop->getParentLoop(); 3300 3301 // Insert the new loop into the loop nest and register the new basic blocks 3302 // before calling any utilities such as SCEV that require valid LoopInfo. 3303 if (ParentLoop) { 3304 ParentLoop->addChildLoop(Lp); 3305 } else { 3306 LI->addTopLevelLoop(Lp); 3307 } 3308 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3309 return Lp; 3310 } 3311 3312 void InnerLoopVectorizer::createInductionResumeValues( 3313 Loop *L, Value *VectorTripCount, 3314 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3315 assert(VectorTripCount && L && "Expected valid arguments"); 3316 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3317 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3318 "Inconsistent information about additional bypass."); 3319 // We are going to resume the execution of the scalar loop. 3320 // Go over all of the induction variables that we found and fix the 3321 // PHIs that are left in the scalar version of the loop. 3322 // The starting values of PHI nodes depend on the counter of the last 3323 // iteration in the vectorized loop. 3324 // If we come from a bypass edge then we need to start from the original 3325 // start value. 3326 for (auto &InductionEntry : Legal->getInductionVars()) { 3327 PHINode *OrigPhi = InductionEntry.first; 3328 InductionDescriptor II = InductionEntry.second; 3329 3330 // Create phi nodes to merge from the backedge-taken check block. 3331 PHINode *BCResumeVal = 3332 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3333 LoopScalarPreHeader->getTerminator()); 3334 // Copy original phi DL over to the new one. 3335 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3336 Value *&EndValue = IVEndValues[OrigPhi]; 3337 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3338 if (OrigPhi == OldInduction) { 3339 // We know what the end value is. 
3340 EndValue = VectorTripCount; 3341 } else { 3342 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3343 Type *StepType = II.getStep()->getType(); 3344 Instruction::CastOps CastOp = 3345 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3346 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3347 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3348 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3349 EndValue->setName("ind.end"); 3350 3351 // Compute the end value for the additional bypass (if applicable). 3352 if (AdditionalBypass.first) { 3353 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3354 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3355 StepType, true); 3356 CRD = 3357 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3358 EndValueFromAdditionalBypass = 3359 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3360 EndValueFromAdditionalBypass->setName("ind.end"); 3361 } 3362 } 3363 // The new PHI merges the original incoming value, in case of a bypass, 3364 // or the value at the end of the vectorized loop. 3365 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3366 3367 // Fix the scalar body counter (PHI node). 3368 // The old induction's phi node in the scalar body needs the truncated 3369 // value. 3370 for (BasicBlock *BB : LoopBypassBlocks) 3371 BCResumeVal->addIncoming(II.getStartValue(), BB); 3372 3373 if (AdditionalBypass.first) 3374 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3375 EndValueFromAdditionalBypass); 3376 3377 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3378 } 3379 } 3380 3381 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3382 MDNode *OrigLoopID) { 3383 assert(L && "Expected valid loop."); 3384 3385 // The trip counts should be cached by now. 3386 Value *Count = getOrCreateTripCount(L); 3387 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3388 3389 // We need the OrigLoop (scalar loop part) latch terminator to help 3390 // produce correct debug info for the middle block BB instructions. 3391 // The legality check stage guarantees that the loop will have a single 3392 // latch. 3393 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3394 "Scalar loop latch terminator isn't a branch"); 3395 BranchInst *ScalarLatchBr = 3396 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3397 3398 // Add a check in the middle block to see if we have completed 3399 // all of the iterations in the first vector loop. 3400 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3401 // If tail is to be folded, we know we don't need to run the remainder. 3402 Value *CmpN = Builder.getTrue(); 3403 if (!Cost->foldTailByMasking()) { 3404 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3405 VectorTripCount, "cmp.n", 3406 LoopMiddleBlock->getTerminator()); 3407 3408 // Here we use the same DebugLoc as the scalar loop latch branch instead 3409 // of the corresponding compare because they may have ended up with 3410 // different line numbers and we want to avoid awkward line stepping while 3411 // debugging. Eg. if the compare has got a line number inside the loop. 
3412 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3413 } 3414 3415 BranchInst *BrInst = 3416 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3417 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3418 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3419 3420 // Get ready to start creating new instructions into the vectorized body. 3421 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3422 "Inconsistent vector loop preheader"); 3423 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3424 3425 Optional<MDNode *> VectorizedLoopID = 3426 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3427 LLVMLoopVectorizeFollowupVectorized}); 3428 if (VectorizedLoopID.hasValue()) { 3429 L->setLoopID(VectorizedLoopID.getValue()); 3430 3431 // Do not setAlreadyVectorized if loop attributes have been defined 3432 // explicitly. 3433 return LoopVectorPreHeader; 3434 } 3435 3436 // Keep all loop hints from the original loop on the vector loop (we'll 3437 // replace the vectorizer-specific hints below). 3438 if (MDNode *LID = OrigLoop->getLoopID()) 3439 L->setLoopID(LID); 3440 3441 LoopVectorizeHints Hints(L, true, *ORE); 3442 Hints.setAlreadyVectorized(); 3443 3444 #ifdef EXPENSIVE_CHECKS 3445 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3446 LI->verify(*DT); 3447 #endif 3448 3449 return LoopVectorPreHeader; 3450 } 3451 3452 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3453 /* 3454 In this function we generate a new loop. The new loop will contain 3455 the vectorized instructions while the old loop will continue to run the 3456 scalar remainder. 3457 3458 [ ] <-- loop iteration number check. 3459 / | 3460 / v 3461 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3462 | / | 3463 | / v 3464 || [ ] <-- vector pre header. 3465 |/ | 3466 | v 3467 | [ ] \ 3468 | [ ]_| <-- vector loop. 3469 | | 3470 | v 3471 | -[ ] <--- middle-block. 3472 | / | 3473 | / v 3474 -|- >[ ] <--- new preheader. 3475 | | 3476 | v 3477 | [ ] \ 3478 | [ ]_| <-- old scalar loop to handle remainder. 3479 \ | 3480 \ v 3481 >[ ] <-- exit block. 3482 ... 3483 */ 3484 3485 // Get the metadata of the original loop before it gets modified. 3486 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3487 3488 // Create an empty vector loop, and prepare basic blocks for the runtime 3489 // checks. 3490 Loop *Lp = createVectorLoopSkeleton(""); 3491 3492 // Now, compare the new count to zero. If it is zero skip the vector loop and 3493 // jump to the scalar loop. This check also covers the case where the 3494 // backedge-taken count is uint##_max: adding one to it will overflow leading 3495 // to an incorrect trip count of zero. In this (rare) case we will also jump 3496 // to the scalar loop. 3497 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3498 3499 // Generate the code to check any assumptions that we've made for SCEV 3500 // expressions. 3501 emitSCEVChecks(Lp, LoopScalarPreHeader); 3502 3503 // Generate the code that checks in runtime if arrays overlap. We put the 3504 // checks into a separate block to make the more common case of few elements 3505 // faster. 3506 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3507 3508 // Some loops have a single integer induction variable, while other loops 3509 // don't. One example is c++ iterators that often have multiple pointer 3510 // induction variables. In the code below we also support a case where we 3511 // don't have a single induction variable. 
3512 // 3513 // We try to obtain an induction variable from the original loop as hard 3514 // as possible. However if we don't find one that: 3515 // - is an integer 3516 // - counts from zero, stepping by one 3517 // - is the size of the widest induction variable type 3518 // then we create a new one. 3519 OldInduction = Legal->getPrimaryInduction(); 3520 Type *IdxTy = Legal->getWidestInductionType(); 3521 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3522 // The loop step is equal to the vectorization factor (num of SIMD elements) 3523 // times the unroll factor (num of SIMD instructions). 3524 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3525 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 3526 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3527 Induction = 3528 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3529 getDebugLocFromInstOrOperands(OldInduction)); 3530 3531 // Emit phis for the new starting index of the scalar loop. 3532 createInductionResumeValues(Lp, CountRoundDown); 3533 3534 return completeLoopSkeleton(Lp, OrigLoopID); 3535 } 3536 3537 // Fix up external users of the induction variable. At this point, we are 3538 // in LCSSA form, with all external PHIs that use the IV having one input value, 3539 // coming from the remainder loop. We need those PHIs to also have a correct 3540 // value for the IV when arriving directly from the middle block. 3541 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3542 const InductionDescriptor &II, 3543 Value *CountRoundDown, Value *EndValue, 3544 BasicBlock *MiddleBlock) { 3545 // There are two kinds of external IV usages - those that use the value 3546 // computed in the last iteration (the PHI) and those that use the penultimate 3547 // value (the value that feeds into the phi from the loop latch). 3548 // We allow both, but they, obviously, have different values. 3549 3550 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3551 3552 DenseMap<Value *, Value *> MissingVals; 3553 3554 // An external user of the last iteration's value should see the value that 3555 // the remainder loop uses to initialize its own IV. 3556 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3557 for (User *U : PostInc->users()) { 3558 Instruction *UI = cast<Instruction>(U); 3559 if (!OrigLoop->contains(UI)) { 3560 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3561 MissingVals[UI] = EndValue; 3562 } 3563 } 3564 3565 // An external user of the penultimate value need to see EndValue - Step. 3566 // The simplest way to get this is to recompute it from the constituent SCEVs, 3567 // that is Start + (Step * (CRD - 1)). 3568 for (User *U : OrigPhi->users()) { 3569 auto *UI = cast<Instruction>(U); 3570 if (!OrigLoop->contains(UI)) { 3571 const DataLayout &DL = 3572 OrigLoop->getHeader()->getModule()->getDataLayout(); 3573 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3574 3575 IRBuilder<> B(MiddleBlock->getTerminator()); 3576 Value *CountMinusOne = B.CreateSub( 3577 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3578 Value *CMO = 3579 !II.getStep()->getType()->isIntegerTy() 3580 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3581 II.getStep()->getType()) 3582 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3583 CMO->setName("cast.cmo"); 3584 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3585 Escape->setName("ind.escape"); 3586 MissingVals[UI] = Escape; 3587 } 3588 } 3589 3590 for (auto &I : MissingVals) { 3591 PHINode *PHI = cast<PHINode>(I.first); 3592 // One corner case we have to handle is two IVs "chasing" each-other, 3593 // that is %IV2 = phi [...], [ %IV1, %latch ] 3594 // In this case, if IV1 has an external use, we need to avoid adding both 3595 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3596 // don't already have an incoming value for the middle block. 3597 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3598 PHI->addIncoming(I.second, MiddleBlock); 3599 } 3600 } 3601 3602 namespace { 3603 3604 struct CSEDenseMapInfo { 3605 static bool canHandle(const Instruction *I) { 3606 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3607 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3608 } 3609 3610 static inline Instruction *getEmptyKey() { 3611 return DenseMapInfo<Instruction *>::getEmptyKey(); 3612 } 3613 3614 static inline Instruction *getTombstoneKey() { 3615 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3616 } 3617 3618 static unsigned getHashValue(const Instruction *I) { 3619 assert(canHandle(I) && "Unknown instruction!"); 3620 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3621 I->value_op_end())); 3622 } 3623 3624 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3625 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3626 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3627 return LHS == RHS; 3628 return LHS->isIdenticalTo(RHS); 3629 } 3630 }; 3631 3632 } // end anonymous namespace 3633 3634 ///Perform cse of induction variable instructions. 3635 static void cse(BasicBlock *BB) { 3636 // Perform simple cse. 3637 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3638 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3639 Instruction *In = &*I++; 3640 3641 if (!CSEDenseMapInfo::canHandle(In)) 3642 continue; 3643 3644 // Check if we can replace this instruction with any of the 3645 // visited instructions. 3646 if (Instruction *V = CSEMap.lookup(In)) { 3647 In->replaceAllUsesWith(V); 3648 In->eraseFromParent(); 3649 continue; 3650 } 3651 3652 CSEMap[In] = In; 3653 } 3654 } 3655 3656 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3657 ElementCount VF, 3658 bool &NeedToScalarize) { 3659 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3660 Function *F = CI->getCalledFunction(); 3661 Type *ScalarRetTy = CI->getType(); 3662 SmallVector<Type *, 4> Tys, ScalarTys; 3663 for (auto &ArgOp : CI->arg_operands()) 3664 ScalarTys.push_back(ArgOp->getType()); 3665 3666 // Estimate cost of scalarized vector call. The source operands are assumed 3667 // to be vectors, so we need to extract individual elements from there, 3668 // execute VF scalar calls, and then gather the result into the vector return 3669 // value. 3670 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3671 TTI::TCK_RecipThroughput); 3672 if (VF.isScalar()) 3673 return ScalarCallCost; 3674 3675 // Compute corresponding vector type for return value and arguments. 
3676 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3677 for (Type *ScalarTy : ScalarTys) 3678 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3679 3680 // Compute costs of unpacking argument values for the scalar calls and 3681 // packing the return values to a vector. 3682 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3683 3684 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3685 3686 // If we can't emit a vector call for this function, then the currently found 3687 // cost is the cost we need to return. 3688 NeedToScalarize = true; 3689 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3690 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3691 3692 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3693 return Cost; 3694 3695 // If the corresponding vector cost is cheaper, return its cost. 3696 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3697 TTI::TCK_RecipThroughput); 3698 if (VectorCallCost < Cost) { 3699 NeedToScalarize = false; 3700 return VectorCallCost; 3701 } 3702 return Cost; 3703 } 3704 3705 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3706 ElementCount VF) { 3707 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3708 assert(ID && "Expected intrinsic call!"); 3709 3710 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3711 return TTI.getIntrinsicInstrCost(CostAttrs, 3712 TargetTransformInfo::TCK_RecipThroughput); 3713 } 3714 3715 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3716 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3717 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3718 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3719 } 3720 3721 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3722 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3723 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3724 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3725 } 3726 3727 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3728 // For every instruction `I` in MinBWs, truncate the operands, create a 3729 // truncated version of `I` and reextend its result. InstCombine runs 3730 // later and will remove any ext/trunc pairs. 3731 SmallPtrSet<Value *, 4> Erased; 3732 for (const auto &KV : Cost->getMinimalBitwidths()) { 3733 // If the value wasn't vectorized, we must maintain the original scalar 3734 // type. The absence of the value from VectorLoopValueMap indicates that it 3735 // wasn't vectorized. 3736 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3737 continue; 3738 for (unsigned Part = 0; Part < UF; ++Part) { 3739 Value *I = getOrCreateVectorValue(KV.first, Part); 3740 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3741 continue; 3742 Type *OriginalTy = I->getType(); 3743 Type *ScalarTruncatedTy = 3744 IntegerType::get(OriginalTy->getContext(), KV.second); 3745 auto *TruncatedTy = FixedVectorType::get( 3746 ScalarTruncatedTy, 3747 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3748 if (TruncatedTy == OriginalTy) 3749 continue; 3750 3751 IRBuilder<> B(cast<Instruction>(I)); 3752 auto ShrinkOperand = [&](Value *V) -> Value * { 3753 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3754 if (ZI->getSrcTy() == TruncatedTy) 3755 return ZI->getOperand(0); 3756 return B.CreateZExtOrTrunc(V, TruncatedTy); 3757 }; 3758 3759 // The actual instruction modification depends on the instruction type, 3760 // unfortunately. 
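      // For example (illustrative only), an i32 add whose minimal bit width
      // is 8 becomes roughly:
      //   %t0 = trunc <4 x i32> %a to <4 x i8>
      //   %t1 = trunc <4 x i32> %b to <4 x i8>
      //   %n  = add <4 x i8> %t0, %t1
      //   %r  = zext <4 x i8> %n to <4 x i32>
      // with the trunc/zext pairs expected to be cleaned up by InstCombine.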
3761 Value *NewI = nullptr; 3762 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3763 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3764 ShrinkOperand(BO->getOperand(1))); 3765 3766 // Any wrapping introduced by shrinking this operation shouldn't be 3767 // considered undefined behavior. So, we can't unconditionally copy 3768 // arithmetic wrapping flags to NewI. 3769 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3770 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3771 NewI = 3772 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3773 ShrinkOperand(CI->getOperand(1))); 3774 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3775 NewI = B.CreateSelect(SI->getCondition(), 3776 ShrinkOperand(SI->getTrueValue()), 3777 ShrinkOperand(SI->getFalseValue())); 3778 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3779 switch (CI->getOpcode()) { 3780 default: 3781 llvm_unreachable("Unhandled cast!"); 3782 case Instruction::Trunc: 3783 NewI = ShrinkOperand(CI->getOperand(0)); 3784 break; 3785 case Instruction::SExt: 3786 NewI = B.CreateSExtOrTrunc( 3787 CI->getOperand(0), 3788 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3789 break; 3790 case Instruction::ZExt: 3791 NewI = B.CreateZExtOrTrunc( 3792 CI->getOperand(0), 3793 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3794 break; 3795 } 3796 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3797 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3798 ->getNumElements(); 3799 auto *O0 = B.CreateZExtOrTrunc( 3800 SI->getOperand(0), 3801 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3802 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3803 ->getNumElements(); 3804 auto *O1 = B.CreateZExtOrTrunc( 3805 SI->getOperand(1), 3806 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3807 3808 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3809 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3810 // Don't do anything with the operands, just extend the result. 3811 continue; 3812 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3813 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3814 ->getNumElements(); 3815 auto *O0 = B.CreateZExtOrTrunc( 3816 IE->getOperand(0), 3817 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3818 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3819 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3820 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3821 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3822 ->getNumElements(); 3823 auto *O0 = B.CreateZExtOrTrunc( 3824 EE->getOperand(0), 3825 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3826 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3827 } else { 3828 // If we don't know what to do, be conservative and don't do anything. 3829 continue; 3830 } 3831 3832 // Lastly, extend the result. 3833 NewI->takeName(cast<Instruction>(I)); 3834 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3835 I->replaceAllUsesWith(Res); 3836 cast<Instruction>(I)->eraseFromParent(); 3837 Erased.insert(I); 3838 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3839 } 3840 } 3841 3842 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3843 for (const auto &KV : Cost->getMinimalBitwidths()) { 3844 // If the value wasn't vectorized, we must maintain the original scalar 3845 // type. 
    // The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
      }
    }
  }
}

void InnerLoopVectorizer::fixVectorizedLoop() {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF.isVector())
    truncateToMinimalBitwidths();

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs();
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs();

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // Fix-up external users of the induction variables.
  for (auto &Entry : Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  fixLCSSAPHIs();
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less precise result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
  assert(!VF.isScalable() &&
         "cannot use scalable ElementCount to determine unroll factor");
  setProfileInfoAfterUnrolling(
      LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
      LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}

void InnerLoopVectorizer::fixCrossIterationPHIs() {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
    // Handle first-order recurrences and reductions that need to be fixed.
    if (Legal->isFirstOrderRecurrence(&Phi))
      fixFirstOrderRecurrence(&Phi);
    else if (Legal->isReductionVariable(&Phi))
      fixReduction(&Phi);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // temporary value for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body:
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Get the original loop preheader and single loop latch.
  auto *Preheader = OrigLoop->getLoopPreheader();
  auto *Latch = OrigLoop->getLoopLatch();

  // Get the initial and previous values of the scalar recurrence.
  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
  auto *Previous = Phi->getIncomingValueForBlock(Latch);

  // Create a vector from the initial value.
  auto *VectorInit = ScalarInit;
  if (VF.isVector()) {
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    VectorInit = Builder.CreateInsertElement(
        UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
        Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
  }

  // We constructed a temporary phi node in the first phase of vectorization.
  // This phi node will eventually be deleted.
  Builder.SetInsertPoint(
      cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));

  // Create a phi node for the new recurrence. The current value will either be
  // the initial value inserted into a vector or loop-varying vector value.
  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

  // Get the vectorized previous value of the last part UF - 1. It appears last
  // among all unrolled iterations, due to the order of their construction.
4008 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 4009 4010 // Find and set the insertion point after the previous value if it is an 4011 // instruction. 4012 BasicBlock::iterator InsertPt; 4013 // Note that the previous value may have been constant-folded so it is not 4014 // guaranteed to be an instruction in the vector loop. 4015 // FIXME: Loop invariant values do not form recurrences. We should deal with 4016 // them earlier. 4017 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4018 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4019 else { 4020 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4021 if (isa<PHINode>(PreviousLastPart)) 4022 // If the previous value is a phi node, we should insert after all the phi 4023 // nodes in the block containing the PHI to avoid breaking basic block 4024 // verification. Note that the basic block may be different to 4025 // LoopVectorBody, in case we predicate the loop. 4026 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4027 else 4028 InsertPt = ++PreviousInst->getIterator(); 4029 } 4030 Builder.SetInsertPoint(&*InsertPt); 4031 4032 // We will construct a vector for the recurrence by combining the values for 4033 // the current and previous iterations. This is the required shuffle mask. 4034 assert(!VF.isScalable()); 4035 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4036 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4037 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4038 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4039 4040 // The vector from which to take the initial value for the current iteration 4041 // (actual or unrolled). Initially, this is the vector phi node. 4042 Value *Incoming = VecPhi; 4043 4044 // Shuffle the current and previous vector and update the vector parts. 4045 for (unsigned Part = 0; Part < UF; ++Part) { 4046 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 4047 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 4048 auto *Shuffle = 4049 VF.isVector() 4050 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4051 : Incoming; 4052 PhiPart->replaceAllUsesWith(Shuffle); 4053 cast<Instruction>(PhiPart)->eraseFromParent(); 4054 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 4055 Incoming = PreviousPart; 4056 } 4057 4058 // Fix the latch value of the new recurrence in the vector loop. 4059 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4060 4061 // Extract the last vector element in the middle block. This will be the 4062 // initial value for the recurrence when jumping to the scalar loop. 4063 auto *ExtractForScalar = Incoming; 4064 if (VF.isVector()) { 4065 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4066 ExtractForScalar = Builder.CreateExtractElement( 4067 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4068 "vector.recur.extract"); 4069 } 4070 // Extract the second last element in the middle block if the 4071 // Phi is used outside the loop. We need to extract the phi itself 4072 // and not the last element (the phi update in the current iteration). This 4073 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4074 // when the scalar loop is not run at all. 
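// In the VF = 4 example above this means: the scalar loop resumes with
// s_init = v2[3] (the last a[i] read by the vector loop), whereas an
// out-of-loop user of the phi itself needs v2[2], the value the recurrence
// held during the final vector iteration.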
4075 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4076 if (VF.isVector())
4077 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4078 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4079 "vector.recur.extract.for.phi");
4080 // When the loop is unrolled without vectorizing, initialize
4081 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
4082 // value of `Incoming`. This is analogous to the vectorized case above:
4083 // extracting the second-to-last element when VF > 1.
4084 else if (UF > 1)
4085 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4086
4087 // Fix the initial value of the original recurrence in the scalar loop.
4088 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4089 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4090 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4091 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4092 Start->addIncoming(Incoming, BB);
4093 }
4094
4095 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4096 Phi->setName("scalar.recur");
4097
4098 // Finally, fix users of the recurrence outside the loop. The users will need
4099 // either the last value of the scalar recurrence or the last value of the
4100 // vector recurrence we extracted in the middle block. Since the loop is in
4101 // LCSSA form, we just need to find all the phi nodes for the original scalar
4102 // recurrence in the exit block, and then add an edge for the middle block.
4103 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4104 if (LCSSAPhi.getIncomingValue(0) == Phi) {
4105 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4106 }
4107 }
4108 }
4109
4110 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4111 Constant *Zero = Builder.getInt32(0);
4112
4113 // Get its reduction variable descriptor.
4114 assert(Legal->isReductionVariable(Phi) &&
4115 "Unable to find the reduction variable");
4116 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4117
4118 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4119 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4120 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4121 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
4122 RdxDesc.getMinMaxRecurrenceKind();
4123 setDebugLocFromInst(Builder, ReductionStartValue);
4124 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4125
4126 // We need to generate a reduction vector from the incoming scalar.
4127 // To do so, we need to generate the 'identity' vector and override
4128 // one of the elements with the incoming scalar reduction. We need
4129 // to do it in the vector-loop preheader.
4130 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4131
4132 // This is the vector-clone of the value that leaves the loop.
4133 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4134
4135 // Find the reduction identity value: zero for addition, or and xor;
4136 // one for multiplication; -1 (all ones) for and.
4137 Value *Identity;
4138 Value *VectorStart;
4139 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
4140 RK == RecurrenceDescriptor::RK_FloatMinMax) {
4141 // MinMax reductions have the start value as their identity.
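// For instance, for an smax reduction with start value %s and VF = 4, both
// Identity and VectorStart become the splat <%s, %s, %s, %s>, since
// max(%s, %s) == %s.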
4142 if (VF.isScalar() || IsInLoopReductionPhi) { 4143 VectorStart = Identity = ReductionStartValue; 4144 } else { 4145 VectorStart = Identity = 4146 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 4147 } 4148 } else { 4149 // Handle other reduction kinds: 4150 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 4151 RK, MinMaxKind, VecTy->getScalarType()); 4152 if (VF.isScalar() || IsInLoopReductionPhi) { 4153 Identity = Iden; 4154 // This vector is the Identity vector where the first element is the 4155 // incoming scalar reduction. 4156 VectorStart = ReductionStartValue; 4157 } else { 4158 Identity = ConstantVector::getSplat(VF, Iden); 4159 4160 // This vector is the Identity vector where the first element is the 4161 // incoming scalar reduction. 4162 VectorStart = 4163 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 4164 } 4165 } 4166 4167 // Wrap flags are in general invalid after vectorization, clear them. 4168 clearReductionWrapFlags(RdxDesc); 4169 4170 // Fix the vector-loop phi. 4171 4172 // Reductions do not have to start at zero. They can start with 4173 // any loop invariant values. 4174 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4175 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4176 4177 for (unsigned Part = 0; Part < UF; ++Part) { 4178 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 4179 Value *Val = getOrCreateVectorValue(LoopVal, Part); 4180 // Make sure to add the reduction start value only to the 4181 // first unroll part. 4182 Value *StartVal = (Part == 0) ? VectorStart : Identity; 4183 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 4184 cast<PHINode>(VecRdxPhi) 4185 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4186 } 4187 4188 // Before each round, move the insertion point right between 4189 // the PHIs and the values we are going to write. 4190 // This allows us to write both PHINodes and the extractelement 4191 // instructions. 4192 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4193 4194 setDebugLocFromInst(Builder, LoopExitInst); 4195 4196 // If tail is folded by masking, the vector value to leave the loop should be 4197 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4198 // instead of the former. For an inloop reduction the reduction will already 4199 // be predicated, and does not need to be handled here. 4200 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4201 for (unsigned Part = 0; Part < UF; ++Part) { 4202 Value *VecLoopExitInst = 4203 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4204 Value *Sel = nullptr; 4205 for (User *U : VecLoopExitInst->users()) { 4206 if (isa<SelectInst>(U)) { 4207 assert(!Sel && "Reduction exit feeding two selects"); 4208 Sel = U; 4209 } else 4210 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4211 } 4212 assert(Sel && "Reduction exit feeds no select"); 4213 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4214 4215 // If the target can create a predicated operator for the reduction at no 4216 // extra cost in the loop (for example a predicated vadd), it can be 4217 // cheaper for the select to remain in the loop than be sunk out of it, 4218 // and so use the select value for the phi instead of the old 4219 // LoopExitValue. 
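// Schematically, for one unrolled part the select found above looks like
//   %sel = select <mask>, %rdx.next, %rdx.phi
// (names illustrative); when the target prefers a predicated reduction
// select, %sel becomes the value feeding %rdx.phi back from the vector
// loop latch instead of %rdx.next.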
4220 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4221 if (PreferPredicatedReductionSelect || 4222 TTI->preferPredicatedReductionSelect( 4223 RdxDesc.getRecurrenceBinOp(), Phi->getType(), 4224 TargetTransformInfo::ReductionFlags())) { 4225 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4226 VecRdxPhi->setIncomingValueForBlock( 4227 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4228 } 4229 } 4230 } 4231 4232 // If the vector reduction can be performed in a smaller type, we truncate 4233 // then extend the loop exit value to enable InstCombine to evaluate the 4234 // entire expression in the smaller type. 4235 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4236 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4237 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4238 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4239 Builder.SetInsertPoint( 4240 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4241 VectorParts RdxParts(UF); 4242 for (unsigned Part = 0; Part < UF; ++Part) { 4243 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4244 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4245 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4246 : Builder.CreateZExt(Trunc, VecTy); 4247 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4248 UI != RdxParts[Part]->user_end();) 4249 if (*UI != Trunc) { 4250 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4251 RdxParts[Part] = Extnd; 4252 } else { 4253 ++UI; 4254 } 4255 } 4256 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4257 for (unsigned Part = 0; Part < UF; ++Part) { 4258 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4259 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4260 } 4261 } 4262 4263 // Reduce all of the unrolled parts into a single vector. 4264 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4265 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 4266 4267 // The middle block terminator has already been assigned a DebugLoc here (the 4268 // OrigLoop's single latch terminator). We want the whole middle block to 4269 // appear to execute on this line because: (a) it is all compiler generated, 4270 // (b) these instructions are always executed after evaluating the latch 4271 // conditional branch, and (c) other passes may add new predecessors which 4272 // terminate on this line. This is the easiest way to ensure we don't 4273 // accidentally cause an extra step back into the loop while debugging. 4274 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4275 for (unsigned Part = 1; Part < UF; ++Part) { 4276 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4277 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4278 // Floating point operations had to be 'fast' to enable the reduction. 4279 ReducedPartRdx = addFastMathFlag( 4280 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4281 ReducedPartRdx, "bin.rdx"), 4282 RdxDesc.getFastMathFlags()); 4283 else 4284 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 4285 RdxPart); 4286 } 4287 4288 // Create the reduction after the loop. Note that inloop reductions create the 4289 // target reduction in the loop using a Reduction recipe. 
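// For example, an integer add reduction combines the UF parts with vector
// adds above and is then typically lowered here to a single horizontal
// reduction intrinsic (e.g. llvm.vector.reduce.add) over the final vector.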
4290 if (VF.isVector() && !IsInLoopReductionPhi) { 4291 bool NoNaN = Legal->hasFunNoNaNAttr(); 4292 ReducedPartRdx = 4293 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 4294 // If the reduction can be performed in a smaller type, we need to extend 4295 // the reduction to the wider type before we branch to the original loop. 4296 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4297 ReducedPartRdx = 4298 RdxDesc.isSigned() 4299 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4300 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4301 } 4302 4303 // Create a phi node that merges control-flow from the backedge-taken check 4304 // block and the middle block. 4305 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4306 LoopScalarPreHeader->getTerminator()); 4307 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4308 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4309 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4310 4311 // Now, we need to fix the users of the reduction variable 4312 // inside and outside of the scalar remainder loop. 4313 // We know that the loop is in LCSSA form. We need to update the 4314 // PHI nodes in the exit blocks. 4315 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4316 // All PHINodes need to have a single entry edge, or two if 4317 // we already fixed them. 4318 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4319 4320 // We found a reduction value exit-PHI. Update it with the 4321 // incoming bypass edge. 4322 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4323 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4324 } // end of the LCSSA phi scan. 4325 4326 // Fix the scalar loop reduction variable with the incoming reduction sum 4327 // from the vector body and from the backedge value. 4328 int IncomingEdgeBlockIdx = 4329 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4330 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4331 // Pick the other block. 4332 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4333 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4334 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4335 } 4336 4337 void InnerLoopVectorizer::clearReductionWrapFlags( 4338 RecurrenceDescriptor &RdxDesc) { 4339 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 4340 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 4341 RK != RecurrenceDescriptor::RK_IntegerMult) 4342 return; 4343 4344 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4345 assert(LoopExitInstr && "null loop exit instruction"); 4346 SmallVector<Instruction *, 8> Worklist; 4347 SmallPtrSet<Instruction *, 8> Visited; 4348 Worklist.push_back(LoopExitInstr); 4349 Visited.insert(LoopExitInstr); 4350 4351 while (!Worklist.empty()) { 4352 Instruction *Cur = Worklist.pop_back_val(); 4353 if (isa<OverflowingBinaryOperator>(Cur)) 4354 for (unsigned Part = 0; Part < UF; ++Part) { 4355 Value *V = getOrCreateVectorValue(Cur, Part); 4356 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4357 } 4358 4359 for (User *U : Cur->users()) { 4360 Instruction *UI = cast<Instruction>(U); 4361 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4362 Visited.insert(UI).second) 4363 Worklist.push_back(UI); 4364 } 4365 } 4366 } 4367 4368 void InnerLoopVectorizer::fixLCSSAPHIs() { 4369 assert(!VF.isScalable() && "the code below assumes fixed width vectors"); 4370 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4371 if (LCSSAPhi.getNumIncomingValues() == 1) { 4372 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4373 // Non-instruction incoming values will have only one value. 4374 unsigned LastLane = 0; 4375 if (isa<Instruction>(IncomingValue)) 4376 LastLane = Cost->isUniformAfterVectorization( 4377 cast<Instruction>(IncomingValue), VF) 4378 ? 0 4379 : VF.getKnownMinValue() - 1; 4380 // Can be a loop invariant incoming value or the last scalar value to be 4381 // extracted from the vectorized loop. 4382 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4383 Value *lastIncomingValue = 4384 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4385 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4386 } 4387 } 4388 } 4389 4390 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4391 // The basic block and loop containing the predicated instruction. 4392 auto *PredBB = PredInst->getParent(); 4393 auto *VectorLoop = LI->getLoopFor(PredBB); 4394 4395 // Initialize a worklist with the operands of the predicated instruction. 4396 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4397 4398 // Holds instructions that we need to analyze again. An instruction may be 4399 // reanalyzed if we don't yet know if we can sink it or not. 4400 SmallVector<Instruction *, 8> InstsToReanalyze; 4401 4402 // Returns true if a given use occurs in the predicated block. Phi nodes use 4403 // their operands in their corresponding predecessor blocks. 4404 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4405 auto *I = cast<Instruction>(U.getUser()); 4406 BasicBlock *BB = I->getParent(); 4407 if (auto *Phi = dyn_cast<PHINode>(I)) 4408 BB = Phi->getIncomingBlock( 4409 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4410 return BB == PredBB; 4411 }; 4412 4413 // Iteratively sink the scalarized operands of the predicated instruction 4414 // into the block we created for it. When an instruction is sunk, it's 4415 // operands are then added to the worklist. 
The algorithm ends once a pass
4416 // through the worklist fails to sink a single instruction.
4417 bool Changed;
4418 do {
4419 // Add the instructions that need to be reanalyzed to the worklist, and
4420 // reset the changed indicator.
4421 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4422 InstsToReanalyze.clear();
4423 Changed = false;
4424
4425 while (!Worklist.empty()) {
4426 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4427
4428 // We can't sink an instruction if it is a phi node, is already in the
4429 // predicated block, is not in the loop, or may have side effects.
4430 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4431 !VectorLoop->contains(I) || I->mayHaveSideEffects())
4432 continue;
4433
4434 // It's legal to sink the instruction if all its uses occur in the
4435 // predicated block. Otherwise, there's nothing to do yet, and we may
4436 // need to reanalyze the instruction.
4437 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4438 InstsToReanalyze.push_back(I);
4439 continue;
4440 }
4441
4442 // Move the instruction to the beginning of the predicated block, and add
4443 // its operands to the worklist.
4444 I->moveBefore(&*PredBB->getFirstInsertionPt());
4445 Worklist.insert(I->op_begin(), I->op_end());
4446
4447 // The sinking may have enabled other instructions to be sunk, so we will
4448 // need to iterate.
4449 Changed = true;
4450 }
4451 } while (Changed);
4452 }
4453
4454 void InnerLoopVectorizer::fixNonInductionPHIs() {
4455 for (PHINode *OrigPhi : OrigPHIsToFix) {
4456 PHINode *NewPhi =
4457 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4458 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4459
4460 SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4461 predecessors(OrigPhi->getParent()));
4462 SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4463 predecessors(NewPhi->getParent()));
4464 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4465 "Scalar and Vector BB should have the same number of predecessors");
4466
4467 // The insertion point in Builder may be invalidated by the time we get
4468 // here. Force the Builder insertion point to something valid so that we do
4469 // not run into issues during insertion point restore in
4470 // getOrCreateVectorValue calls below.
4471 Builder.SetInsertPoint(NewPhi);
4472
4473 // The predecessor order is preserved and we can rely on mapping between
4474 // scalar and vector block predecessors.
4475 for (unsigned i = 0; i < NumIncomingValues; ++i) {
4476 BasicBlock *NewPredBB = VectorBBPredecessors[i];
4477
4478 // When looking up the new scalar/vector values to fix up, use incoming
4479 // values from the original phi.
4480 Value *ScIncV =
4481 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4482
4483 // The scalar incoming value may need a broadcast.
4484 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4485 NewPhi->addIncoming(NewIncV, NewPredBB);
4486 }
4487 }
4488 }
4489
4490 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4491 VPUser &Operands, unsigned UF,
4492 ElementCount VF, bool IsPtrLoopInvariant,
4493 SmallBitVector &IsIndexLoopInvariant,
4494 VPTransformState &State) {
4495 // Construct a vector GEP by widening the operands of the scalar GEP as
4496 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4497 // results in a vector of pointers when at least one operand of the GEP
4498 // is vector-typed.
Thus, to keep the representation compact, we only use 4499 // vector-typed operands for loop-varying values. 4500 4501 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4502 // If we are vectorizing, but the GEP has only loop-invariant operands, 4503 // the GEP we build (by only using vector-typed operands for 4504 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4505 // produce a vector of pointers, we need to either arbitrarily pick an 4506 // operand to broadcast, or broadcast a clone of the original GEP. 4507 // Here, we broadcast a clone of the original. 4508 // 4509 // TODO: If at some point we decide to scalarize instructions having 4510 // loop-invariant operands, this special case will no longer be 4511 // required. We would add the scalarization decision to 4512 // collectLoopScalars() and teach getVectorValue() to broadcast 4513 // the lane-zero scalar value. 4514 auto *Clone = Builder.Insert(GEP->clone()); 4515 for (unsigned Part = 0; Part < UF; ++Part) { 4516 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4517 State.set(VPDef, GEP, EntryPart, Part); 4518 addMetadata(EntryPart, GEP); 4519 } 4520 } else { 4521 // If the GEP has at least one loop-varying operand, we are sure to 4522 // produce a vector of pointers. But if we are only unrolling, we want 4523 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4524 // produce with the code below will be scalar (if VF == 1) or vector 4525 // (otherwise). Note that for the unroll-only case, we still maintain 4526 // values in the vector mapping with initVector, as we do for other 4527 // instructions. 4528 for (unsigned Part = 0; Part < UF; ++Part) { 4529 // The pointer operand of the new GEP. If it's loop-invariant, we 4530 // won't broadcast it. 4531 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4532 : State.get(Operands.getOperand(0), Part); 4533 4534 // Collect all the indices for the new GEP. If any index is 4535 // loop-invariant, we won't broadcast it. 4536 SmallVector<Value *, 4> Indices; 4537 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4538 VPValue *Operand = Operands.getOperand(I); 4539 if (IsIndexLoopInvariant[I - 1]) 4540 Indices.push_back(State.get(Operand, {0, 0})); 4541 else 4542 Indices.push_back(State.get(Operand, Part)); 4543 } 4544 4545 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4546 // but it should be a vector, otherwise. 4547 auto *NewGEP = 4548 GEP->isInBounds() 4549 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4550 Indices) 4551 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4552 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4553 "NewGEP is not a pointer vector"); 4554 State.set(VPDef, GEP, NewGEP, Part); 4555 addMetadata(NewGEP, GEP); 4556 } 4557 } 4558 } 4559 4560 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4561 ElementCount VF) { 4562 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4563 PHINode *P = cast<PHINode>(PN); 4564 if (EnableVPlanNativePath) { 4565 // Currently we enter here in the VPlan-native path for non-induction 4566 // PHIs where all control flow is uniform. We simply widen these PHIs. 4567 // Create a vector phi with no operands - the vector phi operands will be 4568 // set at the end of vector code generation. 4569 Type *VecTy = 4570 (VF.isScalar()) ? 
PN->getType() : VectorType::get(PN->getType(), VF); 4571 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4572 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4573 OrigPHIsToFix.push_back(P); 4574 4575 return; 4576 } 4577 4578 assert(PN->getParent() == OrigLoop->getHeader() && 4579 "Non-header phis should have been handled elsewhere"); 4580 4581 // In order to support recurrences we need to be able to vectorize Phi nodes. 4582 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4583 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4584 // this value when we vectorize all of the instructions that use the PHI. 4585 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4586 for (unsigned Part = 0; Part < UF; ++Part) { 4587 // This is phase one of vectorizing PHIs. 4588 bool ScalarPHI = 4589 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4590 Type *VecTy = 4591 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4592 Value *EntryPart = PHINode::Create( 4593 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4594 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4595 } 4596 return; 4597 } 4598 4599 setDebugLocFromInst(Builder, P); 4600 4601 // This PHINode must be an induction variable. 4602 // Make sure that we know about it. 4603 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4604 4605 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4606 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4607 4608 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4609 // which can be found from the original scalar operations. 4610 switch (II.getKind()) { 4611 case InductionDescriptor::IK_NoInduction: 4612 llvm_unreachable("Unknown induction"); 4613 case InductionDescriptor::IK_IntInduction: 4614 case InductionDescriptor::IK_FpInduction: 4615 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4616 case InductionDescriptor::IK_PtrInduction: { 4617 // Handle the pointer induction variable case. 4618 assert(P->getType()->isPointerTy() && "Unexpected type."); 4619 4620 if (Cost->isScalarAfterVectorization(P, VF)) { 4621 // This is the normalized GEP that starts counting at zero. 4622 Value *PtrInd = 4623 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4624 // Determine the number of scalars we need to generate for each unroll 4625 // iteration. If the instruction is uniform, we only need to generate the 4626 // first lane. Otherwise, we generate all VF values. 4627 unsigned Lanes = 4628 Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.getKnownMinValue(); 4629 for (unsigned Part = 0; Part < UF; ++Part) { 4630 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4631 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4632 Lane + Part * VF.getKnownMinValue()); 4633 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4634 Value *SclrGep = 4635 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4636 SclrGep->setName("next.gep"); 4637 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4638 } 4639 } 4640 return; 4641 } 4642 assert(isa<SCEVConstant>(II.getStep()) && 4643 "Induction step not a SCEV constant!"); 4644 Type *PhiType = II.getStep()->getType(); 4645 4646 // Build a pointer phi 4647 Value *ScalarStartValue = II.getStartValue(); 4648 Type *ScStValueType = ScalarStartValue->getType(); 4649 PHINode *NewPointerPhi = 4650 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4651 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4652 4653 // A pointer induction, performed by using a gep 4654 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4655 Instruction *InductionLoc = LoopLatch->getTerminator(); 4656 const SCEV *ScalarStep = II.getStep(); 4657 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4658 Value *ScalarStepValue = 4659 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4660 Value *InductionGEP = GetElementPtrInst::Create( 4661 ScStValueType->getPointerElementType(), NewPointerPhi, 4662 Builder.CreateMul( 4663 ScalarStepValue, 4664 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4665 "ptr.ind", InductionLoc); 4666 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4667 4668 // Create UF many actual address geps that use the pointer 4669 // phi as base and a vectorized version of the step value 4670 // (<step*0, ..., step*N>) as offset. 4671 for (unsigned Part = 0; Part < UF; ++Part) { 4672 SmallVector<Constant *, 8> Indices; 4673 // Create a vector of consecutive numbers from zero to VF. 4674 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4675 Indices.push_back( 4676 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4677 Constant *StartOffset = ConstantVector::get(Indices); 4678 4679 Value *GEP = Builder.CreateGEP( 4680 ScStValueType->getPointerElementType(), NewPointerPhi, 4681 Builder.CreateMul( 4682 StartOffset, 4683 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4684 "vector.gep")); 4685 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4686 } 4687 } 4688 } 4689 } 4690 4691 /// A helper function for checking whether an integer division-related 4692 /// instruction may divide by zero (in which case it must be predicated if 4693 /// executed conditionally in the scalar code). 4694 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4695 /// Non-zero divisors that are non compile-time constants will not be 4696 /// converted into multiplication, so we will still end up scalarizing 4697 /// the division, but can do so w/o predication. 
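/// For example, `%x udiv 7` never needs such predication, whereas
/// `%x udiv %y` with a non-constant %y does when executed conditionally,
/// since %y may be zero at run time.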
4698 static bool mayDivideByZero(Instruction &I) { 4699 assert((I.getOpcode() == Instruction::UDiv || 4700 I.getOpcode() == Instruction::SDiv || 4701 I.getOpcode() == Instruction::URem || 4702 I.getOpcode() == Instruction::SRem) && 4703 "Unexpected instruction"); 4704 Value *Divisor = I.getOperand(1); 4705 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4706 return !CInt || CInt->isZero(); 4707 } 4708 4709 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4710 VPUser &User, 4711 VPTransformState &State) { 4712 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4713 switch (I.getOpcode()) { 4714 case Instruction::Call: 4715 case Instruction::Br: 4716 case Instruction::PHI: 4717 case Instruction::GetElementPtr: 4718 case Instruction::Select: 4719 llvm_unreachable("This instruction is handled by a different recipe."); 4720 case Instruction::UDiv: 4721 case Instruction::SDiv: 4722 case Instruction::SRem: 4723 case Instruction::URem: 4724 case Instruction::Add: 4725 case Instruction::FAdd: 4726 case Instruction::Sub: 4727 case Instruction::FSub: 4728 case Instruction::FNeg: 4729 case Instruction::Mul: 4730 case Instruction::FMul: 4731 case Instruction::FDiv: 4732 case Instruction::FRem: 4733 case Instruction::Shl: 4734 case Instruction::LShr: 4735 case Instruction::AShr: 4736 case Instruction::And: 4737 case Instruction::Or: 4738 case Instruction::Xor: { 4739 // Just widen unops and binops. 4740 setDebugLocFromInst(Builder, &I); 4741 4742 for (unsigned Part = 0; Part < UF; ++Part) { 4743 SmallVector<Value *, 2> Ops; 4744 for (VPValue *VPOp : User.operands()) 4745 Ops.push_back(State.get(VPOp, Part)); 4746 4747 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4748 4749 if (auto *VecOp = dyn_cast<Instruction>(V)) 4750 VecOp->copyIRFlags(&I); 4751 4752 // Use this vector value for all users of the original instruction. 4753 State.set(Def, &I, V, Part); 4754 addMetadata(V, &I); 4755 } 4756 4757 break; 4758 } 4759 case Instruction::ICmp: 4760 case Instruction::FCmp: { 4761 // Widen compares. Generate vector compares. 4762 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4763 auto *Cmp = cast<CmpInst>(&I); 4764 setDebugLocFromInst(Builder, Cmp); 4765 for (unsigned Part = 0; Part < UF; ++Part) { 4766 Value *A = State.get(User.getOperand(0), Part); 4767 Value *B = State.get(User.getOperand(1), Part); 4768 Value *C = nullptr; 4769 if (FCmp) { 4770 // Propagate fast math flags. 4771 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4772 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4773 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4774 } else { 4775 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4776 } 4777 State.set(Def, &I, C, Part); 4778 addMetadata(C, &I); 4779 } 4780 4781 break; 4782 } 4783 4784 case Instruction::ZExt: 4785 case Instruction::SExt: 4786 case Instruction::FPToUI: 4787 case Instruction::FPToSI: 4788 case Instruction::FPExt: 4789 case Instruction::PtrToInt: 4790 case Instruction::IntToPtr: 4791 case Instruction::SIToFP: 4792 case Instruction::UIToFP: 4793 case Instruction::Trunc: 4794 case Instruction::FPTrunc: 4795 case Instruction::BitCast: { 4796 auto *CI = cast<CastInst>(&I); 4797 setDebugLocFromInst(Builder, CI); 4798 4799 /// Vectorize casts. 4800 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4801 Type *DestTy = 4802 (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4803 4804 for (unsigned Part = 0; Part < UF; ++Part) { 4805 Value *A = State.get(User.getOperand(0), Part); 4806 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4807 State.set(Def, &I, Cast, Part); 4808 addMetadata(Cast, &I); 4809 } 4810 break; 4811 } 4812 default: 4813 // This instruction is not vectorized by simple widening. 4814 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4815 llvm_unreachable("Unhandled instruction!"); 4816 } // end of switch. 4817 } 4818 4819 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4820 VPUser &ArgOperands, 4821 VPTransformState &State) { 4822 assert(!isa<DbgInfoIntrinsic>(I) && 4823 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4824 setDebugLocFromInst(Builder, &I); 4825 4826 Module *M = I.getParent()->getParent()->getParent(); 4827 auto *CI = cast<CallInst>(&I); 4828 4829 SmallVector<Type *, 4> Tys; 4830 for (Value *ArgOperand : CI->arg_operands()) 4831 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4832 4833 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4834 4835 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4836 // version of the instruction. 4837 // Is it beneficial to perform intrinsic call compared to lib call? 4838 bool NeedToScalarize = false; 4839 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4840 bool UseVectorIntrinsic = 4841 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4842 assert((UseVectorIntrinsic || !NeedToScalarize) && 4843 "Instruction should be scalarized elsewhere."); 4844 4845 for (unsigned Part = 0; Part < UF; ++Part) { 4846 SmallVector<Value *, 4> Args; 4847 for (auto &I : enumerate(ArgOperands.operands())) { 4848 // Some intrinsics have a scalar argument - don't replace it with a 4849 // vector. 4850 Value *Arg; 4851 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4852 Arg = State.get(I.value(), Part); 4853 else 4854 Arg = State.get(I.value(), {0, 0}); 4855 Args.push_back(Arg); 4856 } 4857 4858 Function *VectorF; 4859 if (UseVectorIntrinsic) { 4860 // Use vector version of the intrinsic. 4861 Type *TysForDecl[] = {CI->getType()}; 4862 if (VF.isVector()) { 4863 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4864 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4865 } 4866 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4867 assert(VectorF && "Can't retrieve vector intrinsic."); 4868 } else { 4869 // Use vector version of the function call. 4870 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4871 #ifndef NDEBUG 4872 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4873 "Can't create vector function."); 4874 #endif 4875 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4876 } 4877 SmallVector<OperandBundleDef, 1> OpBundles; 4878 CI->getOperandBundlesAsDefs(OpBundles); 4879 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4880 4881 if (isa<FPMathOperator>(V)) 4882 V->copyFastMathFlags(CI); 4883 4884 State.set(Def, &I, V, Part); 4885 addMetadata(V, &I); 4886 } 4887 } 4888 4889 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4890 VPUser &Operands, 4891 bool InvariantCond, 4892 VPTransformState &State) { 4893 setDebugLocFromInst(Builder, &I); 4894 4895 // The condition can be loop invariant but still defined inside the 4896 // loop. 
This means that we can't just use the original 'cond' value. 4897 // We have to take the 'vectorized' value and pick the first lane. 4898 // Instcombine will make this a no-op. 4899 auto *InvarCond = 4900 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4901 4902 for (unsigned Part = 0; Part < UF; ++Part) { 4903 Value *Cond = 4904 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4905 Value *Op0 = State.get(Operands.getOperand(1), Part); 4906 Value *Op1 = State.get(Operands.getOperand(2), Part); 4907 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4908 State.set(VPDef, &I, Sel, Part); 4909 addMetadata(Sel, &I); 4910 } 4911 } 4912 4913 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4914 // We should not collect Scalars more than once per VF. Right now, this 4915 // function is called from collectUniformsAndScalars(), which already does 4916 // this check. Collecting Scalars for VF=1 does not make any sense. 4917 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4918 "This function should not be visited twice for the same VF"); 4919 4920 SmallSetVector<Instruction *, 8> Worklist; 4921 4922 // These sets are used to seed the analysis with pointers used by memory 4923 // accesses that will remain scalar. 4924 SmallSetVector<Instruction *, 8> ScalarPtrs; 4925 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4926 auto *Latch = TheLoop->getLoopLatch(); 4927 4928 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4929 // The pointer operands of loads and stores will be scalar as long as the 4930 // memory access is not a gather or scatter operation. The value operand of a 4931 // store will remain scalar if the store is scalarized. 4932 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4933 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4934 assert(WideningDecision != CM_Unknown && 4935 "Widening decision should be ready at this moment"); 4936 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4937 if (Ptr == Store->getValueOperand()) 4938 return WideningDecision == CM_Scalarize; 4939 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4940 "Ptr is neither a value or pointer operand"); 4941 return WideningDecision != CM_GatherScatter; 4942 }; 4943 4944 // A helper that returns true if the given value is a bitcast or 4945 // getelementptr instruction contained in the loop. 4946 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4947 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4948 isa<GetElementPtrInst>(V)) && 4949 !TheLoop->isLoopInvariant(V); 4950 }; 4951 4952 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4953 if (!isa<PHINode>(Ptr) || 4954 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4955 return false; 4956 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4957 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4958 return false; 4959 return isScalarUse(MemAccess, Ptr); 4960 }; 4961 4962 // A helper that evaluates a memory access's use of a pointer. If the 4963 // pointer is actually the pointer induction of a loop, it is being 4964 // inserted into Worklist. If the use will be a scalar use, and the 4965 // pointer is only used by memory accesses, we place the pointer in 4966 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
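// For example, the pointer operand of a unit-stride load that will be
// widened (CM_Widen) is a scalar use, whereas a gather/scatter
// (CM_GatherScatter) consumes a whole vector of pointers, so such a
// pointer belongs in PossibleNonScalarPtrs rather than ScalarPtrs.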
4967 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4968 if (isScalarPtrInduction(MemAccess, Ptr)) {
4969 Worklist.insert(cast<Instruction>(Ptr));
4970 Instruction *Update = cast<Instruction>(
4971 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4972 Worklist.insert(Update);
4973 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4974 << "\n");
4975 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4976 << "\n");
4977 return;
4978 }
4979 // We only care about bitcast and getelementptr instructions contained in
4980 // the loop.
4981 if (!isLoopVaryingBitCastOrGEP(Ptr))
4982 return;
4983
4984 // If the pointer has already been identified as scalar (e.g., if it was
4985 // also identified as uniform), there's nothing to do.
4986 auto *I = cast<Instruction>(Ptr);
4987 if (Worklist.count(I))
4988 return;
4989
4990 // If the use of the pointer will be a scalar use, and all users of the
4991 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4992 // place the pointer in PossibleNonScalarPtrs.
4993 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4994 return isa<LoadInst>(U) || isa<StoreInst>(U);
4995 }))
4996 ScalarPtrs.insert(I);
4997 else
4998 PossibleNonScalarPtrs.insert(I);
4999 };
5000
5001 // We seed the scalars analysis with two classes of instructions: (1)
5002 // instructions marked uniform-after-vectorization and (2) bitcast,
5003 // getelementptr and (pointer) phi instructions used by memory accesses
5004 // requiring a scalar use.
5005 //
5006 // (1) Add to the worklist all instructions that have been identified as
5007 // uniform-after-vectorization.
5008 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5009
5010 // (2) Add to the worklist all bitcast and getelementptr instructions used by
5011 // memory accesses requiring a scalar use. The pointer operands of loads and
5012 // stores will be scalar as long as the memory access is not a gather or
5013 // scatter operation. The value operand of a store will remain scalar if the
5014 // store is scalarized.
5015 for (auto *BB : TheLoop->blocks())
5016 for (auto &I : *BB) {
5017 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5018 evaluatePtrUse(Load, Load->getPointerOperand());
5019 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5020 evaluatePtrUse(Store, Store->getPointerOperand());
5021 evaluatePtrUse(Store, Store->getValueOperand());
5022 }
5023 }
5024 for (auto *I : ScalarPtrs)
5025 if (!PossibleNonScalarPtrs.count(I)) {
5026 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5027 Worklist.insert(I);
5028 }
5029
5030 // Insert the forced scalars.
5031 // FIXME: Currently widenPHIInstruction() often creates a dead vector
5032 // induction variable when the PHI user is scalarized.
5033 auto ForcedScalar = ForcedScalars.find(VF);
5034 if (ForcedScalar != ForcedScalars.end())
5035 for (auto *I : ForcedScalar->second)
5036 Worklist.insert(I);
5037
5038 // Expand the worklist by looking through any bitcasts and getelementptr
5039 // instructions we've already identified as scalar. This is similar to the
5040 // expansion step in collectLoopUniforms(); however, here we're only
5041 // expanding to include additional bitcasts and getelementptr instructions.
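// For example, if a getelementptr already in the worklist has a
// loop-varying bitcast operand whose only users are scalar memory accesses
// or other worklist members, that bitcast is pulled into the worklist too.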
5042 unsigned Idx = 0; 5043 while (Idx != Worklist.size()) { 5044 Instruction *Dst = Worklist[Idx++]; 5045 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5046 continue; 5047 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5048 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5049 auto *J = cast<Instruction>(U); 5050 return !TheLoop->contains(J) || Worklist.count(J) || 5051 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5052 isScalarUse(J, Src)); 5053 })) { 5054 Worklist.insert(Src); 5055 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5056 } 5057 } 5058 5059 // An induction variable will remain scalar if all users of the induction 5060 // variable and induction variable update remain scalar. 5061 for (auto &Induction : Legal->getInductionVars()) { 5062 auto *Ind = Induction.first; 5063 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5064 5065 // If tail-folding is applied, the primary induction variable will be used 5066 // to feed a vector compare. 5067 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5068 continue; 5069 5070 // Determine if all users of the induction variable are scalar after 5071 // vectorization. 5072 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5073 auto *I = cast<Instruction>(U); 5074 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5075 }); 5076 if (!ScalarInd) 5077 continue; 5078 5079 // Determine if all users of the induction variable update instruction are 5080 // scalar after vectorization. 5081 auto ScalarIndUpdate = 5082 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5083 auto *I = cast<Instruction>(U); 5084 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5085 }); 5086 if (!ScalarIndUpdate) 5087 continue; 5088 5089 // The induction variable and its update instruction will remain scalar. 5090 Worklist.insert(Ind); 5091 Worklist.insert(IndUpdate); 5092 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5093 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5094 << "\n"); 5095 } 5096 5097 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5098 } 5099 5100 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5101 ElementCount VF) { 5102 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5103 if (!blockNeedsPredication(I->getParent())) 5104 return false; 5105 switch(I->getOpcode()) { 5106 default: 5107 break; 5108 case Instruction::Load: 5109 case Instruction::Store: { 5110 if (!Legal->isMaskRequired(I)) 5111 return false; 5112 auto *Ptr = getLoadStorePointerOperand(I); 5113 auto *Ty = getMemInstValueType(I); 5114 // We have already decided how to vectorize this instruction, get that 5115 // result. 5116 if (VF.isVector()) { 5117 InstWidening WideningDecision = getWideningDecision(I, VF); 5118 assert(WideningDecision != CM_Unknown && 5119 "Widening decision should be ready at this moment"); 5120 return WideningDecision == CM_Scalarize; 5121 } 5122 const Align Alignment = getLoadStoreAlignment(I); 5123 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5124 isLegalMaskedGather(Ty, Alignment)) 5125 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5126 isLegalMaskedScatter(Ty, Alignment)); 5127 } 5128 case Instruction::UDiv: 5129 case Instruction::SDiv: 5130 case Instruction::SRem: 5131 case Instruction::URem: 5132 return mayDivideByZero(*I); 5133 } 5134 return false; 5135 } 5136 5137 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5138 Instruction *I, ElementCount VF) { 5139 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5140 assert(getWideningDecision(I, VF) == CM_Unknown && 5141 "Decision should not be set yet."); 5142 auto *Group = getInterleavedAccessGroup(I); 5143 assert(Group && "Must have a group."); 5144 5145 // If the instruction's allocated size doesn't equal it's type size, it 5146 // requires padding and will be scalarized. 5147 auto &DL = I->getModule()->getDataLayout(); 5148 auto *ScalarTy = getMemInstValueType(I); 5149 if (hasIrregularType(ScalarTy, DL, VF)) 5150 return false; 5151 5152 // Check if masking is required. 5153 // A Group may need masking for one of two reasons: it resides in a block that 5154 // needs predication, or it was decided to use masking to deal with gaps. 5155 bool PredicatedAccessRequiresMasking = 5156 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5157 bool AccessWithGapsRequiresMasking = 5158 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5159 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5160 return true; 5161 5162 // If masked interleaving is required, we expect that the user/target had 5163 // enabled it, because otherwise it either wouldn't have been created or 5164 // it should have been invalidated by the CostModel. 5165 assert(useMaskedInterleavedAccesses(TTI) && 5166 "Masked interleave-groups for predicated accesses are not enabled."); 5167 5168 auto *Ty = getMemInstValueType(I); 5169 const Align Alignment = getLoadStoreAlignment(I); 5170 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5171 : TTI.isLegalMaskedStore(Ty, Alignment); 5172 } 5173 5174 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5175 Instruction *I, ElementCount VF) { 5176 // Get and ensure we have a valid memory instruction. 5177 LoadInst *LI = dyn_cast<LoadInst>(I); 5178 StoreInst *SI = dyn_cast<StoreInst>(I); 5179 assert((LI || SI) && "Invalid memory instruction"); 5180 5181 auto *Ptr = getLoadStorePointerOperand(I); 5182 5183 // In order to be widened, the pointer should be consecutive, first of all. 5184 if (!Legal->isConsecutivePtr(Ptr)) 5185 return false; 5186 5187 // If the instruction is a store located in a predicated block, it will be 5188 // scalarized. 5189 if (isScalarWithPredication(I)) 5190 return false; 5191 5192 // If the instruction's allocated size doesn't equal it's type size, it 5193 // requires padding and will be scalarized. 5194 auto &DL = I->getModule()->getDataLayout(); 5195 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5196 if (hasIrregularType(ScalarTy, DL, VF)) 5197 return false; 5198 5199 return true; 5200 } 5201 5202 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5203 // We should not collect Uniforms more than once per VF. Right now, 5204 // this function is called from collectUniformsAndScalars(), which 5205 // already does this check. Collecting Uniforms for VF=1 does not make any 5206 // sense. 
5207 5208 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5209 "This function should not be visited twice for the same VF"); 5210 5211 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5212 // not analyze again. Uniforms.count(VF) will return 1. 5213 Uniforms[VF].clear(); 5214 5215 // We now know that the loop is vectorizable! 5216 // Collect instructions inside the loop that will remain uniform after 5217 // vectorization. 5218 5219 // Global values, params and instructions outside of current loop are out of 5220 // scope. 5221 auto isOutOfScope = [&](Value *V) -> bool { 5222 Instruction *I = dyn_cast<Instruction>(V); 5223 return (!I || !TheLoop->contains(I)); 5224 }; 5225 5226 SetVector<Instruction *> Worklist; 5227 BasicBlock *Latch = TheLoop->getLoopLatch(); 5228 5229 // Instructions that are scalar with predication must not be considered 5230 // uniform after vectorization, because that would create an erroneous 5231 // replicating region where only a single instance out of VF should be formed. 5232 // TODO: optimize such seldom cases if found important, see PR40816. 5233 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5234 if (isOutOfScope(I)) { 5235 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5236 << *I << "\n"); 5237 return; 5238 } 5239 if (isScalarWithPredication(I, VF)) { 5240 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5241 << *I << "\n"); 5242 return; 5243 } 5244 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5245 Worklist.insert(I); 5246 }; 5247 5248 // Start with the conditional branch. If the branch condition is an 5249 // instruction contained in the loop that is only used by the branch, it is 5250 // uniform. 5251 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5252 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5253 addToWorklistIfAllowed(Cmp); 5254 5255 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5256 InstWidening WideningDecision = getWideningDecision(I, VF); 5257 assert(WideningDecision != CM_Unknown && 5258 "Widening decision should be ready at this moment"); 5259 5260 // A uniform memory op is itself uniform. We exclude uniform stores 5261 // here as they demand the last lane, not the first one. 5262 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5263 assert(WideningDecision == CM_Scalarize); 5264 return true; 5265 } 5266 5267 return (WideningDecision == CM_Widen || 5268 WideningDecision == CM_Widen_Reverse || 5269 WideningDecision == CM_Interleave); 5270 }; 5271 5272 5273 // Returns true if Ptr is the pointer operand of a memory access instruction 5274 // I, and I is known to not require scalarization. 5275 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5276 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5277 }; 5278 5279 // Holds a list of values which are known to have at least one uniform use. 5280 // Note that there may be other uses which aren't uniform. A "uniform use" 5281 // here is something which only demands lane 0 of the unrolled iterations; 5282 // it does not imply that all lanes produce the same value (e.g. this is not 5283 // the usual meaning of uniform) 5284 SmallPtrSet<Value *, 8> HasUniformUse; 5285 5286 // Scan the loop for instructions which are either a) known to have only 5287 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 
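// For example, the address computation feeding a consecutive (widened)
// store demands only lane 0, even though the stored value itself is a
// full vector.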
5288 for (auto *BB : TheLoop->blocks()) 5289 for (auto &I : *BB) { 5290 // If there's no pointer operand, there's nothing to do. 5291 auto *Ptr = getLoadStorePointerOperand(&I); 5292 if (!Ptr) 5293 continue; 5294 5295 // A uniform memory op is itself uniform. We exclude uniform stores 5296 // here as they demand the last lane, not the first one. 5297 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5298 addToWorklistIfAllowed(&I); 5299 5300 if (isUniformDecision(&I, VF)) { 5301 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5302 HasUniformUse.insert(Ptr); 5303 } 5304 } 5305 5306 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5307 // demanding) users. Since loops are assumed to be in LCSSA form, this 5308 // disallows uses outside the loop as well. 5309 for (auto *V : HasUniformUse) { 5310 if (isOutOfScope(V)) 5311 continue; 5312 auto *I = cast<Instruction>(V); 5313 auto UsersAreMemAccesses = 5314 llvm::all_of(I->users(), [&](User *U) -> bool { 5315 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5316 }); 5317 if (UsersAreMemAccesses) 5318 addToWorklistIfAllowed(I); 5319 } 5320 5321 // Expand Worklist in topological order: whenever a new instruction 5322 // is added , its users should be already inside Worklist. It ensures 5323 // a uniform instruction will only be used by uniform instructions. 5324 unsigned idx = 0; 5325 while (idx != Worklist.size()) { 5326 Instruction *I = Worklist[idx++]; 5327 5328 for (auto OV : I->operand_values()) { 5329 // isOutOfScope operands cannot be uniform instructions. 5330 if (isOutOfScope(OV)) 5331 continue; 5332 // First order recurrence Phi's should typically be considered 5333 // non-uniform. 5334 auto *OP = dyn_cast<PHINode>(OV); 5335 if (OP && Legal->isFirstOrderRecurrence(OP)) 5336 continue; 5337 // If all the users of the operand are uniform, then add the 5338 // operand into the uniform worklist. 5339 auto *OI = cast<Instruction>(OV); 5340 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5341 auto *J = cast<Instruction>(U); 5342 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5343 })) 5344 addToWorklistIfAllowed(OI); 5345 } 5346 } 5347 5348 // For an instruction to be added into Worklist above, all its users inside 5349 // the loop should also be in Worklist. However, this condition cannot be 5350 // true for phi nodes that form a cyclic dependence. We must process phi 5351 // nodes separately. An induction variable will remain uniform if all users 5352 // of the induction variable and induction variable update remain uniform. 5353 // The code below handles both pointer and non-pointer induction variables. 5354 for (auto &Induction : Legal->getInductionVars()) { 5355 auto *Ind = Induction.first; 5356 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5357 5358 // Determine if all users of the induction variable are uniform after 5359 // vectorization. 5360 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5361 auto *I = cast<Instruction>(U); 5362 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5363 isVectorizedMemAccessUse(I, Ind); 5364 }); 5365 if (!UniformInd) 5366 continue; 5367 5368 // Determine if all users of the induction variable update instruction are 5369 // uniform after vectorization. 
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    addToWorklistIfAllowed(Ind);
    addToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}

bool LoopVectorizationCostModel::runtimeChecksRequired() {
  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

  if (Legal->getRuntimePointerChecking()->Need) {
    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  if (!PSE.getUnionPredicate().getPredicates().empty()) {
    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
        "runtime SCEV checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check for small trip count",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop without such check by compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}

Optional<ElementCount>
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do so, since the condition is still likely to
    // be dynamically uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return MaxVF;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;
    break;
  }

  // Now try the tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  assert(!MaxVF.isScalable() &&
         "Scalable vectors do not yet support tail folding");
  assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
         "MaxVF must be a power of 2");
  unsigned MaxVFtimesIC =
      UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
  if (TC > 0 && TC % MaxVFtimesIC == 0) {
    // Accept MaxVF if we do not have a tail.
    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
    return MaxVF;
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxVF;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fall back to vectorization with a scalar epilogue.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    if (PreferPredicateOverEpilogue ==
        PreferPredicateTy::PredicateOrDontVectorize) {
      LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
      return None;
    }
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxVF;
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return None;
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return None;
}

ElementCount
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
                                                 ElementCount UserVF) {
  assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();

  if (UserVF.isNonZero()) {
    // If legally unsafe, clamp the user vectorization factor to a safe value.
    unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
    if (UserVF.getFixedValue() <= MaxSafeVF)
      return UserVF;

    LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                      << " is unsafe, clamping to max safe VF=" << MaxSafeVF
                      << ".\n");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                        TheLoop->getStartLoc(),
                                        TheLoop->getHeader())
             << "User-specified vectorization factor "
             << ore::NV("UserVectorizationFactor", UserVF)
             << " is unsafe, clamping to maximum safe vectorization factor "
             << ore::NV("VectorizationFactor", MaxSafeVF);
    });
    return ElementCount::getFixed(MaxSafeVF);
  }

  WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that neither WidestRegister nor WidestType is necessarily a power
  // of 2.
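  // Illustrative example (numbers assumed): a 256-bit widest register clamped
  // by a 96-bit max safe dependence distance gives WidestRegister = 96; with a
  // widest element type of 32 bits, PowerOf2Floor(96 / 32) yields a
  // MaxVectorSize of 2 rather than 3.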
  unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << WidestRegister << " bits.\n");

  assert(MaxVectorSize <= WidestRegister &&
         "Did not expect to pack so many elements"
         " into one vector!");
  if (MaxVectorSize == 0) {
    LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
    return ElementCount::getFixed(MaxVectorSize);
  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
             isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to be the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below.
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                      << ConstTripCount << "\n");
    MaxVectorSize = ConstTripCount;
    return ElementCount::getFixed(MaxVectorSize);
  }

  unsigned MaxVF = MaxVectorSize;
  if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorSize).
    SmallVector<ElementCount, 8> VFs;
    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
    for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
      VFs.push_back(ElementCount::getFixed(VS));

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      for (auto &pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i].getKnownMinValue();
        break;
      }
    }
    if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return ElementCount::getFixed(MaxVF);
}

VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
  assert(!MaxVF.isScalable() && "scalable vectors not yet supported");

  float Cost = expectedCost(ElementCount::getFixed(1)).first;
  const float ScalarCost = Cost;
  unsigned Width = 1;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && MaxVF.isVector()) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    Cost = std::numeric_limits<float>::max();
  }

  for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
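    // Illustrative example (costs assumed): if the scalar loop costs 8 per
    // iteration and the VF=4 loop costs 20 per vector iteration, the per-lane
    // cost is 20 / 4 = 5, which beats the scalar cost of 8, so VF=4 is
    // recorded as profitable.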
5655 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5656 float VectorCost = C.first / (float)i; 5657 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5658 << " costs: " << (int)VectorCost << ".\n"); 5659 if (!C.second && !ForceVectorization) { 5660 LLVM_DEBUG( 5661 dbgs() << "LV: Not considering vector loop of width " << i 5662 << " because it will not generate any vector instructions.\n"); 5663 continue; 5664 } 5665 5666 // If profitable add it to ProfitableVF list. 5667 if (VectorCost < ScalarCost) { 5668 ProfitableVFs.push_back(VectorizationFactor( 5669 {ElementCount::getFixed(i), (unsigned)VectorCost})); 5670 } 5671 5672 if (VectorCost < Cost) { 5673 Cost = VectorCost; 5674 Width = i; 5675 } 5676 } 5677 5678 if (!EnableCondStoresVectorization && NumPredStores) { 5679 reportVectorizationFailure("There are conditional stores.", 5680 "store that is conditionally executed prevents vectorization", 5681 "ConditionalStore", ORE, TheLoop); 5682 Width = 1; 5683 Cost = ScalarCost; 5684 } 5685 5686 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5687 << "LV: Vectorization seems to be not beneficial, " 5688 << "but was forced by a user.\n"); 5689 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5690 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5691 (unsigned)(Width * Cost)}; 5692 return Factor; 5693 } 5694 5695 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5696 const Loop &L, ElementCount VF) const { 5697 // Cross iteration phis such as reductions need special handling and are 5698 // currently unsupported. 5699 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5700 return Legal->isFirstOrderRecurrence(&Phi) || 5701 Legal->isReductionVariable(&Phi); 5702 })) 5703 return false; 5704 5705 // Phis with uses outside of the loop require special handling and are 5706 // currently unsupported. 5707 for (auto &Entry : Legal->getInductionVars()) { 5708 // Look for uses of the value of the induction at the last iteration. 5709 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5710 for (User *U : PostInc->users()) 5711 if (!L.contains(cast<Instruction>(U))) 5712 return false; 5713 // Look for uses of penultimate value of the induction. 5714 for (User *U : Entry.first->users()) 5715 if (!L.contains(cast<Instruction>(U))) 5716 return false; 5717 } 5718 5719 // Induction variables that are widened require special handling that is 5720 // currently not supported. 5721 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5722 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5723 this->isProfitableToScalarize(Entry.first, VF)); 5724 })) 5725 return false; 5726 5727 return true; 5728 } 5729 5730 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5731 const ElementCount VF) const { 5732 // FIXME: We need a much better cost-model to take different parameters such 5733 // as register pressure, code size increase and cost of extra branches into 5734 // account. For now we apply a very crude heuristic and only consider loops 5735 // with vectorization factors larger than a certain value. 5736 // We also consider epilogue vectorization unprofitable for targets that don't 5737 // consider interleaving beneficial (eg. MVE). 
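  // Illustrative example (threshold assumed): with a minimum-VF threshold of
  // 16, a main loop vectorized at VF=8 would not get a vectorized epilogue,
  // while one vectorized at VF=16 would be considered.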
5738 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5739 return false; 5740 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5741 return true; 5742 return false; 5743 } 5744 5745 VectorizationFactor 5746 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5747 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5748 VectorizationFactor Result = VectorizationFactor::Disabled(); 5749 if (!EnableEpilogueVectorization) { 5750 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5751 return Result; 5752 } 5753 5754 if (!isScalarEpilogueAllowed()) { 5755 LLVM_DEBUG( 5756 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5757 "allowed.\n";); 5758 return Result; 5759 } 5760 5761 // Not really a cost consideration, but check for unsupported cases here to 5762 // simplify the logic. 5763 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5764 LLVM_DEBUG( 5765 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5766 "not a supported candidate.\n";); 5767 return Result; 5768 } 5769 5770 if (EpilogueVectorizationForceVF > 1) { 5771 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5772 if (LVP.hasPlanWithVFs( 5773 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5774 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5775 else { 5776 LLVM_DEBUG( 5777 dbgs() 5778 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5779 return Result; 5780 } 5781 } 5782 5783 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5784 TheLoop->getHeader()->getParent()->hasMinSize()) { 5785 LLVM_DEBUG( 5786 dbgs() 5787 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5788 return Result; 5789 } 5790 5791 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 5792 return Result; 5793 5794 for (auto &NextVF : ProfitableVFs) 5795 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 5796 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 5797 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 5798 Result = NextVF; 5799 5800 if (Result != VectorizationFactor::Disabled()) 5801 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5802 << Result.Width.getFixedValue() << "\n";); 5803 return Result; 5804 } 5805 5806 std::pair<unsigned, unsigned> 5807 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5808 unsigned MinWidth = -1U; 5809 unsigned MaxWidth = 8; 5810 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5811 5812 // For each block. 5813 for (BasicBlock *BB : TheLoop->blocks()) { 5814 // For each instruction in the loop. 5815 for (Instruction &I : BB->instructionsWithoutDebug()) { 5816 Type *T = I.getType(); 5817 5818 // Skip ignored values. 5819 if (ValuesToIgnore.count(&I)) 5820 continue; 5821 5822 // Only examine Loads, Stores and PHINodes. 5823 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5824 continue; 5825 5826 // Examine PHI nodes that are reduction variables. Update the type to 5827 // account for the recurrence type. 5828 if (auto *PN = dyn_cast<PHINode>(&I)) { 5829 if (!Legal->isReductionVariable(PN)) 5830 continue; 5831 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5832 T = RdxDesc.getRecurrenceType(); 5833 } 5834 5835 // Examine the stored values. 
5836 if (auto *ST = dyn_cast<StoreInst>(&I)) 5837 T = ST->getValueOperand()->getType(); 5838 5839 // Ignore loaded pointer types and stored pointer types that are not 5840 // vectorizable. 5841 // 5842 // FIXME: The check here attempts to predict whether a load or store will 5843 // be vectorized. We only know this for certain after a VF has 5844 // been selected. Here, we assume that if an access can be 5845 // vectorized, it will be. We should also look at extending this 5846 // optimization to non-pointer types. 5847 // 5848 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5849 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5850 continue; 5851 5852 MinWidth = std::min(MinWidth, 5853 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5854 MaxWidth = std::max(MaxWidth, 5855 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5856 } 5857 } 5858 5859 return {MinWidth, MaxWidth}; 5860 } 5861 5862 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5863 unsigned LoopCost) { 5864 // -- The interleave heuristics -- 5865 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5866 // There are many micro-architectural considerations that we can't predict 5867 // at this level. For example, frontend pressure (on decode or fetch) due to 5868 // code size, or the number and capabilities of the execution ports. 5869 // 5870 // We use the following heuristics to select the interleave count: 5871 // 1. If the code has reductions, then we interleave to break the cross 5872 // iteration dependency. 5873 // 2. If the loop is really small, then we interleave to reduce the loop 5874 // overhead. 5875 // 3. We don't interleave if we think that we will spill registers to memory 5876 // due to the increased register pressure. 5877 5878 if (!isScalarEpilogueAllowed()) 5879 return 1; 5880 5881 // We used the distance for the interleave count. 5882 if (Legal->getMaxSafeDepDistBytes() != -1U) 5883 return 1; 5884 5885 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5886 const bool HasReductions = !Legal->getReductionVars().empty(); 5887 // Do not interleave loops with a relatively small known or estimated trip 5888 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5889 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5890 // because with the above conditions interleaving can expose ILP and break 5891 // cross iteration dependences for reductions. 5892 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5893 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5894 return 1; 5895 5896 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5897 // We divide by these constants so assume that we have at least one 5898 // instruction that uses at least one register. 5899 for (auto& pair : R.MaxLocalUsers) { 5900 pair.second = std::max(pair.second, 1U); 5901 } 5902 5903 // We calculate the interleave count using the following formula. 5904 // Subtract the number of loop invariants from the number of available 5905 // registers. These registers are used by all of the interleaved instances. 5906 // Next, divide the remaining registers by the number of registers that is 5907 // required by the loop, in order to estimate how many parallel instances 5908 // fit without causing spills. All of this is rounded down if necessary to be 5909 // a power of two. 
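  // Illustrative example (register counts assumed): with 32 vector registers,
  // 2 of them holding loop-invariant values and a maximum local usage of 7
  // registers, the estimate is PowerOf2Floor((32 - 2) / 7) = 4 interleaved
  // instances.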
  // We want power-of-two interleave counts to simplify any
  // addressing operations or alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  for (auto &pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
      LoopInvariantRegs = R.LoopInvariantRegs[pair.first];

    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC =
          PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                        std::max(1U, (MaxLocalUsers - 1)));
    }

    IC = std::min(IC, TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  unsigned MaxInterleaveCount =
      TTI.getMaxInterleaveFactor(VF.getKnownMinValue());

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If trip count is known or estimated compile time constant, limit the
  // interleave count to be less than the trip count divided by VF, provided it
  // is at least 1.
  if (BestKnownTC) {
    MaxInterleaveCount =
        std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
    // Make sure MaxInterleaveCount is greater than 0.
    MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(1u, IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0)
    LoopCost = expectedCost(VF).first;

  assert(LoopCost && "Non-zero loop cost expected");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
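  // For example, a single add-reduction interleaved with IC = 2 is split into
  // two independent partial sums, which roughly halves the length of the
  // cross-iteration dependency chain (example for illustration only).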
5990 if (VF.isVector() && HasReductions) { 5991 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5992 return IC; 5993 } 5994 5995 // Note that if we've already vectorized the loop we will have done the 5996 // runtime check and so interleaving won't require further checks. 5997 bool InterleavingRequiresRuntimePointerCheck = 5998 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5999 6000 // We want to interleave small loops in order to reduce the loop overhead and 6001 // potentially expose ILP opportunities. 6002 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6003 << "LV: IC is " << IC << '\n' 6004 << "LV: VF is " << VF.getKnownMinValue() << '\n'); 6005 const bool AggressivelyInterleaveReductions = 6006 TTI.enableAggressiveInterleaving(HasReductions); 6007 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6008 // We assume that the cost overhead is 1 and we use the cost model 6009 // to estimate the cost of the loop and interleave until the cost of the 6010 // loop overhead is about 5% of the cost of the loop. 6011 unsigned SmallIC = 6012 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6013 6014 // Interleave until store/load ports (estimated by max interleave count) are 6015 // saturated. 6016 unsigned NumStores = Legal->getNumStores(); 6017 unsigned NumLoads = Legal->getNumLoads(); 6018 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6019 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6020 6021 // If we have a scalar reduction (vector reductions are already dealt with 6022 // by this point), we can increase the critical path length if the loop 6023 // we're interleaving is inside another loop. Limit, by default to 2, so the 6024 // critical path only gets increased by one reduction operation. 6025 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6026 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6027 SmallIC = std::min(SmallIC, F); 6028 StoresIC = std::min(StoresIC, F); 6029 LoadsIC = std::min(LoadsIC, F); 6030 } 6031 6032 if (EnableLoadStoreRuntimeInterleave && 6033 std::max(StoresIC, LoadsIC) > SmallIC) { 6034 LLVM_DEBUG( 6035 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6036 return std::max(StoresIC, LoadsIC); 6037 } 6038 6039 // If there are scalar reductions and TTI has enabled aggressive 6040 // interleaving for reductions, we will interleave to expose ILP. 6041 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6042 AggressivelyInterleaveReductions) { 6043 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6044 // Interleave no less than SmallIC but not as aggressive as the normal IC 6045 // to satisfy the rare situation when resources are too limited. 6046 return std::max(IC / 2, SmallIC); 6047 } else { 6048 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6049 return SmallIC; 6050 } 6051 } 6052 6053 // Interleave if this is a large loop (small loops are already dealt with by 6054 // this point) that could benefit from interleaving. 
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in a topological order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
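  // Illustrative example (indices assumed): if %a is defined at index 1 and
  // last used at index 4, while %b is defined at index 2 and last used at
  // index 3, then at index 3 both intervals are open and this register class
  // sees two simultaneously live values.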
6133 for (auto &Interval : EndPoint) 6134 TransposeEnds[Interval.second].push_back(Interval.first); 6135 6136 SmallPtrSet<Instruction *, 8> OpenIntervals; 6137 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6138 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6139 6140 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6141 6142 // A lambda that gets the register usage for the given type and VF. 6143 const auto &TTICapture = TTI; 6144 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6145 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6146 return 0U; 6147 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6148 }; 6149 6150 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6151 Instruction *I = IdxToInstr[i]; 6152 6153 // Remove all of the instructions that end at this location. 6154 InstrList &List = TransposeEnds[i]; 6155 for (Instruction *ToRemove : List) 6156 OpenIntervals.erase(ToRemove); 6157 6158 // Ignore instructions that are never used within the loop. 6159 if (!Ends.count(I)) 6160 continue; 6161 6162 // Skip ignored values. 6163 if (ValuesToIgnore.count(I)) 6164 continue; 6165 6166 // For each VF find the maximum usage of registers. 6167 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6168 // Count the number of live intervals. 6169 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6170 6171 if (VFs[j].isScalar()) { 6172 for (auto Inst : OpenIntervals) { 6173 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6174 if (RegUsage.find(ClassID) == RegUsage.end()) 6175 RegUsage[ClassID] = 1; 6176 else 6177 RegUsage[ClassID] += 1; 6178 } 6179 } else { 6180 collectUniformsAndScalars(VFs[j]); 6181 for (auto Inst : OpenIntervals) { 6182 // Skip ignored values for VF > 1. 6183 if (VecValuesToIgnore.count(Inst)) 6184 continue; 6185 if (isScalarAfterVectorization(Inst, VFs[j])) { 6186 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6187 if (RegUsage.find(ClassID) == RegUsage.end()) 6188 RegUsage[ClassID] = 1; 6189 else 6190 RegUsage[ClassID] += 1; 6191 } else { 6192 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6193 if (RegUsage.find(ClassID) == RegUsage.end()) 6194 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6195 else 6196 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6197 } 6198 } 6199 } 6200 6201 for (auto& pair : RegUsage) { 6202 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6203 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6204 else 6205 MaxUsages[j][pair.first] = pair.second; 6206 } 6207 } 6208 6209 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6210 << OpenIntervals.size() << '\n'); 6211 6212 // Add the current instruction to the list of open intervals. 6213 OpenIntervals.insert(I); 6214 } 6215 6216 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6217 SmallMapVector<unsigned, unsigned, 4> Invariant; 6218 6219 for (auto Inst : LoopInvariants) { 6220 unsigned Usage = 6221 VFs[i].isScalar() ? 
              1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      if (Invariant.find(ClassID) == Invariant.end())
        Invariant[ClassID] = Usage;
      else
        Invariant[ClassID] += Usage;
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n";
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked load/gather emulation was previously never allowed.
  // Emulation of masked stores/scatters was only allowed for a limited
  // number of stores.
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() ||
      InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
6300 PredicatedBBsAfterVectorization.insert(BB); 6301 } 6302 } 6303 } 6304 6305 int LoopVectorizationCostModel::computePredInstDiscount( 6306 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 6307 ElementCount VF) { 6308 assert(!isUniformAfterVectorization(PredInst, VF) && 6309 "Instruction marked uniform-after-vectorization will be predicated"); 6310 6311 // Initialize the discount to zero, meaning that the scalar version and the 6312 // vector version cost the same. 6313 int Discount = 0; 6314 6315 // Holds instructions to analyze. The instructions we visit are mapped in 6316 // ScalarCosts. Those instructions are the ones that would be scalarized if 6317 // we find that the scalar version costs less. 6318 SmallVector<Instruction *, 8> Worklist; 6319 6320 // Returns true if the given instruction can be scalarized. 6321 auto canBeScalarized = [&](Instruction *I) -> bool { 6322 // We only attempt to scalarize instructions forming a single-use chain 6323 // from the original predicated block that would otherwise be vectorized. 6324 // Although not strictly necessary, we give up on instructions we know will 6325 // already be scalar to avoid traversing chains that are unlikely to be 6326 // beneficial. 6327 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6328 isScalarAfterVectorization(I, VF)) 6329 return false; 6330 6331 // If the instruction is scalar with predication, it will be analyzed 6332 // separately. We ignore it within the context of PredInst. 6333 if (isScalarWithPredication(I)) 6334 return false; 6335 6336 // If any of the instruction's operands are uniform after vectorization, 6337 // the instruction cannot be scalarized. This prevents, for example, a 6338 // masked load from being scalarized. 6339 // 6340 // We assume we will only emit a value for lane zero of an instruction 6341 // marked uniform after vectorization, rather than VF identical values. 6342 // Thus, if we scalarize an instruction that uses a uniform, we would 6343 // create uses of values corresponding to the lanes we aren't emitting code 6344 // for. This behavior can be changed by allowing getScalarValue to clone 6345 // the lane zero values for uniforms rather than asserting. 6346 for (Use &U : I->operands()) 6347 if (auto *J = dyn_cast<Instruction>(U.get())) 6348 if (isUniformAfterVectorization(J, VF)) 6349 return false; 6350 6351 // Otherwise, we can scalarize the instruction. 6352 return true; 6353 }; 6354 6355 // Compute the expected cost discount from scalarizing the entire expression 6356 // feeding the predicated instruction. We currently only consider expressions 6357 // that are single-use instruction chains. 6358 Worklist.push_back(PredInst); 6359 while (!Worklist.empty()) { 6360 Instruction *I = Worklist.pop_back_val(); 6361 6362 // If we've already analyzed the instruction, there's nothing to do. 6363 if (ScalarCosts.find(I) != ScalarCosts.end()) 6364 continue; 6365 6366 // Compute the cost of the vector instruction. Note that this cost already 6367 // includes the scalarization overhead of the predicated instruction. 6368 unsigned VectorCost = getInstructionCost(I, VF).first; 6369 6370 // Compute the cost of the scalarized instruction. This cost is the cost of 6371 // the instruction as if it wasn't if-converted and instead remained in the 6372 // predicated block. We will scale this cost by block probability after 6373 // computing the scalarization overhead. 
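    // Illustrative example (costs assumed, scalarization overhead ignored):
    // with VF = 4, a vector cost of 12 and a per-lane scalar cost of 4, the
    // scalarized cost is 4 * 4 = 16, scaled down by the reciprocal block
    // probability (2) to 8; the resulting discount of 12 - 8 = 4 favors
    // scalarization.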
6374 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6375 unsigned ScalarCost = 6376 VF.getKnownMinValue() * 6377 getInstructionCost(I, ElementCount::getFixed(1)).first; 6378 6379 // Compute the scalarization overhead of needed insertelement instructions 6380 // and phi nodes. 6381 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6382 ScalarCost += TTI.getScalarizationOverhead( 6383 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6384 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6385 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6386 ScalarCost += 6387 VF.getKnownMinValue() * 6388 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6389 } 6390 6391 // Compute the scalarization overhead of needed extractelement 6392 // instructions. For each of the instruction's operands, if the operand can 6393 // be scalarized, add it to the worklist; otherwise, account for the 6394 // overhead. 6395 for (Use &U : I->operands()) 6396 if (auto *J = dyn_cast<Instruction>(U.get())) { 6397 assert(VectorType::isValidElementType(J->getType()) && 6398 "Instruction has non-scalar type"); 6399 if (canBeScalarized(J)) 6400 Worklist.push_back(J); 6401 else if (needsExtract(J, VF)) { 6402 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6403 ScalarCost += TTI.getScalarizationOverhead( 6404 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6405 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6406 } 6407 } 6408 6409 // Scale the total scalar cost by block probability. 6410 ScalarCost /= getReciprocalPredBlockProb(); 6411 6412 // Compute the discount. A non-negative discount means the vector version 6413 // of the instruction costs more, and scalarizing would be beneficial. 6414 Discount += VectorCost - ScalarCost; 6415 ScalarCosts[I] = ScalarCost; 6416 } 6417 6418 return Discount; 6419 } 6420 6421 LoopVectorizationCostModel::VectorizationCostTy 6422 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6423 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6424 VectorizationCostTy Cost; 6425 6426 // For each block. 6427 for (BasicBlock *BB : TheLoop->blocks()) { 6428 VectorizationCostTy BlockCost; 6429 6430 // For each instruction in the old loop. 6431 for (Instruction &I : BB->instructionsWithoutDebug()) { 6432 // Skip ignored values. 6433 if (ValuesToIgnore.count(&I) || 6434 (VF.isVector() && VecValuesToIgnore.count(&I))) 6435 continue; 6436 6437 VectorizationCostTy C = getInstructionCost(&I, VF); 6438 6439 // Check if we should override the cost. 6440 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6441 C.first = ForceTargetInstructionCost; 6442 6443 BlockCost.first += C.first; 6444 BlockCost.second |= C.second; 6445 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6446 << " for VF " << VF << " For instruction: " << I 6447 << '\n'); 6448 } 6449 6450 // If we are vectorizing a predicated block, it will have been 6451 // if-converted. This means that the block's instructions (aside from 6452 // stores and instructions that may divide by zero) will now be 6453 // unconditionally executed. For the scalar case, we may not always execute 6454 // the predicated block. Thus, scale the block's cost by the probability of 6455 // executing it. 
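    // Illustrative example (probability assumed to be the usual 50%): a
    // predicated block whose instructions cost 10 contributes 10 / 2 = 5 to
    // the scalar loop cost.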
6456 if (VF.isScalar() && blockNeedsPredication(BB)) 6457 BlockCost.first /= getReciprocalPredBlockProb(); 6458 6459 Cost.first += BlockCost.first; 6460 Cost.second |= BlockCost.second; 6461 } 6462 6463 return Cost; 6464 } 6465 6466 /// Gets Address Access SCEV after verifying that the access pattern 6467 /// is loop invariant except the induction variable dependence. 6468 /// 6469 /// This SCEV can be sent to the Target in order to estimate the address 6470 /// calculation cost. 6471 static const SCEV *getAddressAccessSCEV( 6472 Value *Ptr, 6473 LoopVectorizationLegality *Legal, 6474 PredicatedScalarEvolution &PSE, 6475 const Loop *TheLoop) { 6476 6477 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6478 if (!Gep) 6479 return nullptr; 6480 6481 // We are looking for a gep with all loop invariant indices except for one 6482 // which should be an induction variable. 6483 auto SE = PSE.getSE(); 6484 unsigned NumOperands = Gep->getNumOperands(); 6485 for (unsigned i = 1; i < NumOperands; ++i) { 6486 Value *Opd = Gep->getOperand(i); 6487 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6488 !Legal->isInductionVariable(Opd)) 6489 return nullptr; 6490 } 6491 6492 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6493 return PSE.getSCEV(Ptr); 6494 } 6495 6496 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6497 return Legal->hasStride(I->getOperand(0)) || 6498 Legal->hasStride(I->getOperand(1)); 6499 } 6500 6501 unsigned 6502 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6503 ElementCount VF) { 6504 assert(VF.isVector() && 6505 "Scalarization cost of instruction implies vectorization."); 6506 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6507 Type *ValTy = getMemInstValueType(I); 6508 auto SE = PSE.getSE(); 6509 6510 unsigned AS = getLoadStoreAddressSpace(I); 6511 Value *Ptr = getLoadStorePointerOperand(I); 6512 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6513 6514 // Figure out whether the access is strided and get the stride value 6515 // if it's known in compile time 6516 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6517 6518 // Get the cost of the scalar memory instruction and address computation. 6519 unsigned Cost = 6520 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6521 6522 // Don't pass *I here, since it is scalar but will actually be part of a 6523 // vectorized loop where the user of it is a vectorized instruction. 6524 const Align Alignment = getLoadStoreAlignment(I); 6525 Cost += VF.getKnownMinValue() * 6526 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6527 AS, TTI::TCK_RecipThroughput); 6528 6529 // Get the overhead of the extractelement and insertelement instructions 6530 // we might create due to scalarization. 6531 Cost += getScalarizationOverhead(I, VF); 6532 6533 // If we have a predicated store, it may not be executed for each vector 6534 // lane. Scale the cost by the probability of executing the predicated 6535 // block. 6536 if (isPredicatedInst(I)) { 6537 Cost /= getReciprocalPredBlockProb(); 6538 6539 if (useEmulatedMaskMemRefHack(I)) 6540 // Artificially setting to a high enough value to practically disable 6541 // vectorization with such operations. 
6542 Cost = 3000000; 6543 } 6544 6545 return Cost; 6546 } 6547 6548 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6549 ElementCount VF) { 6550 Type *ValTy = getMemInstValueType(I); 6551 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6552 Value *Ptr = getLoadStorePointerOperand(I); 6553 unsigned AS = getLoadStoreAddressSpace(I); 6554 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6555 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6556 6557 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6558 "Stride should be 1 or -1 for consecutive memory access"); 6559 const Align Alignment = getLoadStoreAlignment(I); 6560 unsigned Cost = 0; 6561 if (Legal->isMaskRequired(I)) 6562 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6563 CostKind); 6564 else 6565 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6566 CostKind, I); 6567 6568 bool Reverse = ConsecutiveStride < 0; 6569 if (Reverse) 6570 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6571 return Cost; 6572 } 6573 6574 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6575 ElementCount VF) { 6576 assert(Legal->isUniformMemOp(*I)); 6577 6578 Type *ValTy = getMemInstValueType(I); 6579 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6580 const Align Alignment = getLoadStoreAlignment(I); 6581 unsigned AS = getLoadStoreAddressSpace(I); 6582 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6583 if (isa<LoadInst>(I)) { 6584 return TTI.getAddressComputationCost(ValTy) + 6585 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6586 CostKind) + 6587 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6588 } 6589 StoreInst *SI = cast<StoreInst>(I); 6590 6591 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6592 return TTI.getAddressComputationCost(ValTy) + 6593 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6594 CostKind) + 6595 (isLoopInvariantStoreValue 6596 ? 0 6597 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6598 VF.getKnownMinValue() - 1)); 6599 } 6600 6601 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6602 ElementCount VF) { 6603 Type *ValTy = getMemInstValueType(I); 6604 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6605 const Align Alignment = getLoadStoreAlignment(I); 6606 const Value *Ptr = getLoadStorePointerOperand(I); 6607 6608 return TTI.getAddressComputationCost(VectorTy) + 6609 TTI.getGatherScatterOpCost( 6610 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6611 TargetTransformInfo::TCK_RecipThroughput, I); 6612 } 6613 6614 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6615 ElementCount VF) { 6616 Type *ValTy = getMemInstValueType(I); 6617 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6618 unsigned AS = getLoadStoreAddressSpace(I); 6619 6620 auto Group = getInterleavedAccessGroup(I); 6621 assert(Group && "Fail to get an interleaved access group."); 6622 6623 unsigned InterleaveFactor = Group->getFactor(); 6624 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6625 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6626 6627 // Holds the indices of existing members in an interleaved load group. 6628 // An interleaved store group doesn't need this as it doesn't allow gaps. 
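  // Illustrative example (access pattern assumed): in a factor-4 load group
  // where only members 0 and 2 are present, Indices becomes {0, 2} and the
  // gaps at positions 1 and 3 are accounted for by the target cost hook.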
6629 SmallVector<unsigned, 4> Indices; 6630 if (isa<LoadInst>(I)) { 6631 for (unsigned i = 0; i < InterleaveFactor; i++) 6632 if (Group->getMember(i)) 6633 Indices.push_back(i); 6634 } 6635 6636 // Calculate the cost of the whole interleaved group. 6637 bool UseMaskForGaps = 6638 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6639 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6640 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6641 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6642 6643 if (Group->isReverse()) { 6644 // TODO: Add support for reversed masked interleaved access. 6645 assert(!Legal->isMaskRequired(I) && 6646 "Reverse masked interleaved access not supported."); 6647 Cost += Group->getNumMembers() * 6648 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6649 } 6650 return Cost; 6651 } 6652 6653 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6654 ElementCount VF) { 6655 // Calculate scalar cost only. Vectorization cost should be ready at this 6656 // moment. 6657 if (VF.isScalar()) { 6658 Type *ValTy = getMemInstValueType(I); 6659 const Align Alignment = getLoadStoreAlignment(I); 6660 unsigned AS = getLoadStoreAddressSpace(I); 6661 6662 return TTI.getAddressComputationCost(ValTy) + 6663 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6664 TTI::TCK_RecipThroughput, I); 6665 } 6666 return getWideningCost(I, VF); 6667 } 6668 6669 LoopVectorizationCostModel::VectorizationCostTy 6670 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6671 ElementCount VF) { 6672 assert(!VF.isScalable() && 6673 "the cost model is not yet implemented for scalable vectorization"); 6674 // If we know that this instruction will remain uniform, check the cost of 6675 // the scalar version. 6676 if (isUniformAfterVectorization(I, VF)) 6677 VF = ElementCount::getFixed(1); 6678 6679 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6680 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6681 6682 // Forced scalars do not have any scalarization overhead. 6683 auto ForcedScalar = ForcedScalars.find(VF); 6684 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6685 auto InstSet = ForcedScalar->second; 6686 if (InstSet.count(I)) 6687 return VectorizationCostTy( 6688 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6689 VF.getKnownMinValue()), 6690 false); 6691 } 6692 6693 Type *VectorTy; 6694 unsigned C = getInstructionCost(I, VF, VectorTy); 6695 6696 bool TypeNotScalarized = 6697 VF.isVector() && VectorTy->isVectorTy() && 6698 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6699 return VectorizationCostTy(C, TypeNotScalarized); 6700 } 6701 6702 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6703 ElementCount VF) { 6704 6705 assert(!VF.isScalable() && 6706 "cannot compute scalarization overhead for scalable vectorization"); 6707 if (VF.isScalar()) 6708 return 0; 6709 6710 unsigned Cost = 0; 6711 Type *RetTy = ToVectorTy(I->getType(), VF); 6712 if (!RetTy->isVoidTy() && 6713 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6714 Cost += TTI.getScalarizationOverhead( 6715 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 6716 true, false); 6717 6718 // Some targets keep addresses scalar. 6719 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6720 return Cost; 6721 6722 // Some targets support efficient element stores. 
6723 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6724 return Cost; 6725 6726 // Collect operands to consider. 6727 CallInst *CI = dyn_cast<CallInst>(I); 6728 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6729 6730 // Skip operands that do not require extraction/scalarization and do not incur 6731 // any overhead. 6732 return Cost + TTI.getOperandsScalarizationOverhead( 6733 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 6734 } 6735 6736 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6737 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6738 if (VF.isScalar()) 6739 return; 6740 NumPredStores = 0; 6741 for (BasicBlock *BB : TheLoop->blocks()) { 6742 // For each instruction in the old loop. 6743 for (Instruction &I : *BB) { 6744 Value *Ptr = getLoadStorePointerOperand(&I); 6745 if (!Ptr) 6746 continue; 6747 6748 // TODO: We should generate better code and update the cost model for 6749 // predicated uniform stores. Today they are treated as any other 6750 // predicated store (see added test cases in 6751 // invariant-store-vectorization.ll). 6752 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6753 NumPredStores++; 6754 6755 if (Legal->isUniformMemOp(I)) { 6756 // TODO: Avoid replicating loads and stores instead of 6757 // relying on instcombine to remove them. 6758 // Load: Scalar load + broadcast 6759 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6760 unsigned Cost = getUniformMemOpCost(&I, VF); 6761 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6762 continue; 6763 } 6764 6765 // We assume that widening is the best solution when possible. 6766 if (memoryInstructionCanBeWidened(&I, VF)) { 6767 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6768 int ConsecutiveStride = 6769 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6770 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6771 "Expected consecutive stride."); 6772 InstWidening Decision = 6773 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6774 setWideningDecision(&I, VF, Decision, Cost); 6775 continue; 6776 } 6777 6778 // Choose between Interleaving, Gather/Scatter or Scalarization. 6779 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6780 unsigned NumAccesses = 1; 6781 if (isAccessInterleaved(&I)) { 6782 auto Group = getInterleavedAccessGroup(&I); 6783 assert(Group && "Fail to get an interleaved access group."); 6784 6785 // Make one decision for the whole group. 6786 if (getWideningDecision(&I, VF) != CM_Unknown) 6787 continue; 6788 6789 NumAccesses = Group->getNumMembers(); 6790 if (interleavedAccessCanBeWidened(&I, VF)) 6791 InterleaveCost = getInterleaveGroupCost(&I, VF); 6792 } 6793 6794 unsigned GatherScatterCost = 6795 isLegalGatherOrScatter(&I) 6796 ? getGatherScatterCost(&I, VF) * NumAccesses 6797 : std::numeric_limits<unsigned>::max(); 6798 6799 unsigned ScalarizationCost = 6800 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6801 6802 // Choose better solution for the current VF, 6803 // write down this decision and use it during vectorization. 
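      // Illustrative example (costs assumed): with InterleaveCost = 6,
      // GatherScatterCost = 10 and ScalarizationCost = 14, the access is
      // interleaved; without an interleave group, gather/scatter at 10 would
      // still beat scalarization at 14.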
6804       unsigned Cost;
6805       InstWidening Decision;
6806       if (InterleaveCost <= GatherScatterCost &&
6807           InterleaveCost < ScalarizationCost) {
6808         Decision = CM_Interleave;
6809         Cost = InterleaveCost;
6810       } else if (GatherScatterCost < ScalarizationCost) {
6811         Decision = CM_GatherScatter;
6812         Cost = GatherScatterCost;
6813       } else {
6814         Decision = CM_Scalarize;
6815         Cost = ScalarizationCost;
6816       }
6817       // If the instruction belongs to an interleave group, the whole group
6818       // receives the same decision. The whole group receives the cost, but
6819       // the cost will actually be assigned to one instruction.
6820       if (auto Group = getInterleavedAccessGroup(&I))
6821         setWideningDecision(Group, VF, Decision, Cost);
6822       else
6823         setWideningDecision(&I, VF, Decision, Cost);
6824     }
6825   }
6826
6827   // Make sure that any load of address and any other address computation
6828   // remains scalar unless there is gather/scatter support. This avoids
6829   // inevitable extracts into address registers, and also has the benefit of
6830   // activating LSR more, since that pass can't optimize vectorized
6831   // addresses.
6832   if (TTI.prefersVectorizedAddressing())
6833     return;
6834
6835   // Start with all scalar pointer uses.
6836   SmallPtrSet<Instruction *, 8> AddrDefs;
6837   for (BasicBlock *BB : TheLoop->blocks())
6838     for (Instruction &I : *BB) {
6839       Instruction *PtrDef =
6840           dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6841       if (PtrDef && TheLoop->contains(PtrDef) &&
6842           getWideningDecision(&I, VF) != CM_GatherScatter)
6843         AddrDefs.insert(PtrDef);
6844     }
6845
6846   // Add all instructions used to generate the addresses.
6847   SmallVector<Instruction *, 4> Worklist;
6848   for (auto *I : AddrDefs)
6849     Worklist.push_back(I);
6850   while (!Worklist.empty()) {
6851     Instruction *I = Worklist.pop_back_val();
6852     for (auto &Op : I->operands())
6853       if (auto *InstOp = dyn_cast<Instruction>(Op))
6854         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6855             AddrDefs.insert(InstOp).second)
6856           Worklist.push_back(InstOp);
6857   }
6858
6859   for (auto *I : AddrDefs) {
6860     if (isa<LoadInst>(I)) {
6861       // Setting the desired widening decision should ideally be handled
6862       // by cost functions, but since this involves the task of finding out
6863       // if the loaded register is involved in an address computation, it is
6864       // instead changed here when we know this is the case.
6865       InstWidening Decision = getWideningDecision(I, VF);
6866       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6867         // Scalarize a widened load of address.
6868         setWideningDecision(
6869             I, VF, CM_Scalarize,
6870             (VF.getKnownMinValue() *
6871              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6872       else if (auto Group = getInterleavedAccessGroup(I)) {
6873         // Scalarize an interleave group of address loads.
6874         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6875           if (Instruction *Member = Group->getMember(I))
6876             setWideningDecision(
6877                 Member, VF, CM_Scalarize,
6878                 (VF.getKnownMinValue() *
6879                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6880         }
6881       }
6882     } else
6883       // Make sure I gets scalarized and a cost estimate without
6884       // scalarization overhead.
6885 ForcedScalars[VF].insert(I); 6886 } 6887 } 6888 6889 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6890 ElementCount VF, 6891 Type *&VectorTy) { 6892 Type *RetTy = I->getType(); 6893 if (canTruncateToMinimalBitwidth(I, VF)) 6894 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6895 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6896 auto SE = PSE.getSE(); 6897 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6898 6899 // TODO: We need to estimate the cost of intrinsic calls. 6900 switch (I->getOpcode()) { 6901 case Instruction::GetElementPtr: 6902 // We mark this instruction as zero-cost because the cost of GEPs in 6903 // vectorized code depends on whether the corresponding memory instruction 6904 // is scalarized or not. Therefore, we handle GEPs with the memory 6905 // instruction cost. 6906 return 0; 6907 case Instruction::Br: { 6908 // In cases of scalarized and predicated instructions, there will be VF 6909 // predicated blocks in the vectorized loop. Each branch around these 6910 // blocks requires also an extract of its vector compare i1 element. 6911 bool ScalarPredicatedBB = false; 6912 BranchInst *BI = cast<BranchInst>(I); 6913 if (VF.isVector() && BI->isConditional() && 6914 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6915 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6916 ScalarPredicatedBB = true; 6917 6918 if (ScalarPredicatedBB) { 6919 // Return cost for branches around scalarized and predicated blocks. 6920 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6921 auto *Vec_i1Ty = 6922 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6923 return (TTI.getScalarizationOverhead( 6924 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6925 false, true) + 6926 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 6927 VF.getKnownMinValue())); 6928 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6929 // The back-edge branch will remain, as will all scalar branches. 6930 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6931 else 6932 // This branch will be eliminated by if-conversion. 6933 return 0; 6934 // Note: We currently assume zero cost for an unconditional branch inside 6935 // a predicated block since it will become a fall-through, although we 6936 // may decide in the future to call TTI for all branches. 6937 } 6938 case Instruction::PHI: { 6939 auto *Phi = cast<PHINode>(I); 6940 6941 // First-order recurrences are replaced by vector shuffles inside the loop. 6942 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6943 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 6944 return TTI.getShuffleCost( 6945 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 6946 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 6947 6948 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6949 // converted into select instructions. We require N - 1 selects per phi 6950 // node, where N is the number of incoming values. 
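    // For example (illustrative IR), a phi with three incoming values a, b and
    // c whose edges are guarded by masks m.b and m.c is conceptually lowered to
    // two selects:
    //   %s1 = select <VF x i1> %m.b, %b, %a
    //   %s2 = select <VF x i1> %m.c, %c, %s1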
6951 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 6952 return (Phi->getNumIncomingValues() - 1) * 6953 TTI.getCmpSelInstrCost( 6954 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6955 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6956 CmpInst::BAD_ICMP_PREDICATE, CostKind); 6957 6958 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6959 } 6960 case Instruction::UDiv: 6961 case Instruction::SDiv: 6962 case Instruction::URem: 6963 case Instruction::SRem: 6964 // If we have a predicated instruction, it may not be executed for each 6965 // vector lane. Get the scalarization cost and scale this amount by the 6966 // probability of executing the predicated block. If the instruction is not 6967 // predicated, we fall through to the next case. 6968 if (VF.isVector() && isScalarWithPredication(I)) { 6969 unsigned Cost = 0; 6970 6971 // These instructions have a non-void type, so account for the phi nodes 6972 // that we will create. This cost is likely to be zero. The phi node 6973 // cost, if any, should be scaled by the block probability because it 6974 // models a copy at the end of each predicated block. 6975 Cost += VF.getKnownMinValue() * 6976 TTI.getCFInstrCost(Instruction::PHI, CostKind); 6977 6978 // The cost of the non-predicated instruction. 6979 Cost += VF.getKnownMinValue() * 6980 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6981 6982 // The cost of insertelement and extractelement instructions needed for 6983 // scalarization. 6984 Cost += getScalarizationOverhead(I, VF); 6985 6986 // Scale the cost by the probability of executing the predicated blocks. 6987 // This assumes the predicated block for each vector lane is equally 6988 // likely. 6989 return Cost / getReciprocalPredBlockProb(); 6990 } 6991 LLVM_FALLTHROUGH; 6992 case Instruction::Add: 6993 case Instruction::FAdd: 6994 case Instruction::Sub: 6995 case Instruction::FSub: 6996 case Instruction::Mul: 6997 case Instruction::FMul: 6998 case Instruction::FDiv: 6999 case Instruction::FRem: 7000 case Instruction::Shl: 7001 case Instruction::LShr: 7002 case Instruction::AShr: 7003 case Instruction::And: 7004 case Instruction::Or: 7005 case Instruction::Xor: { 7006 // Since we will replace the stride by 1 the multiplication should go away. 7007 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7008 return 0; 7009 // Certain instructions can be cheaper to vectorize if they have a constant 7010 // second vector operand. One example of this are shifts on x86. 7011 Value *Op2 = I->getOperand(1); 7012 TargetTransformInfo::OperandValueProperties Op2VP; 7013 TargetTransformInfo::OperandValueKind Op2VK = 7014 TTI.getOperandInfo(Op2, Op2VP); 7015 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7016 Op2VK = TargetTransformInfo::OK_UniformValue; 7017 7018 SmallVector<const Value *, 4> Operands(I->operand_values()); 7019 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7020 return N * TTI.getArithmeticInstrCost( 7021 I->getOpcode(), VectorTy, CostKind, 7022 TargetTransformInfo::OK_AnyValue, 7023 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7024 } 7025 case Instruction::FNeg: { 7026 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7027 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7028 return N * TTI.getArithmeticInstrCost( 7029 I->getOpcode(), VectorTy, CostKind, 7030 TargetTransformInfo::OK_AnyValue, 7031 TargetTransformInfo::OK_AnyValue, 7032 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7033 I->getOperand(0), I); 7034 } 7035 case Instruction::Select: { 7036 SelectInst *SI = cast<SelectInst>(I); 7037 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7038 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7039 Type *CondTy = SI->getCondition()->getType(); 7040 if (!ScalarCond) { 7041 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7042 CondTy = VectorType::get(CondTy, VF); 7043 } 7044 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7045 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7046 } 7047 case Instruction::ICmp: 7048 case Instruction::FCmp: { 7049 Type *ValTy = I->getOperand(0)->getType(); 7050 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7051 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7052 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7053 VectorTy = ToVectorTy(ValTy, VF); 7054 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7055 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7056 } 7057 case Instruction::Store: 7058 case Instruction::Load: { 7059 ElementCount Width = VF; 7060 if (Width.isVector()) { 7061 InstWidening Decision = getWideningDecision(I, Width); 7062 assert(Decision != CM_Unknown && 7063 "CM decision should be taken at this point"); 7064 if (Decision == CM_Scalarize) 7065 Width = ElementCount::getFixed(1); 7066 } 7067 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7068 return getMemoryInstructionCost(I, VF); 7069 } 7070 case Instruction::ZExt: 7071 case Instruction::SExt: 7072 case Instruction::FPToUI: 7073 case Instruction::FPToSI: 7074 case Instruction::FPExt: 7075 case Instruction::PtrToInt: 7076 case Instruction::IntToPtr: 7077 case Instruction::SIToFP: 7078 case Instruction::UIToFP: 7079 case Instruction::Trunc: 7080 case Instruction::FPTrunc: 7081 case Instruction::BitCast: { 7082 // Computes the CastContextHint from a Load/Store instruction. 7083 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7084 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7085 "Expected a load or a store!"); 7086 7087 if (VF.isScalar() || !TheLoop->contains(I)) 7088 return TTI::CastContextHint::Normal; 7089 7090 switch (getWideningDecision(I, VF)) { 7091 case LoopVectorizationCostModel::CM_GatherScatter: 7092 return TTI::CastContextHint::GatherScatter; 7093 case LoopVectorizationCostModel::CM_Interleave: 7094 return TTI::CastContextHint::Interleave; 7095 case LoopVectorizationCostModel::CM_Scalarize: 7096 case LoopVectorizationCostModel::CM_Widen: 7097 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7098 : TTI::CastContextHint::Normal; 7099 case LoopVectorizationCostModel::CM_Widen_Reverse: 7100 return TTI::CastContextHint::Reversed; 7101 case LoopVectorizationCostModel::CM_Unknown: 7102 llvm_unreachable("Instr did not go through cost modelling?"); 7103 } 7104 7105 llvm_unreachable("Unhandled case!"); 7106 }; 7107 7108 unsigned Opcode = I->getOpcode(); 7109 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7110 // For Trunc, the context is the only user, which must be a StoreInst. 
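    // For example (illustrative IR), given
    //   %t = trunc i32 %x to i8
    //   store i8 %t, i8* %p
    // the hint is taken from the widening decision of the store, while for
    //   %l = load i8, i8* %q
    //   %e = zext i8 %l to i32
    // it is taken from the widening decision of the load.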
7111 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7112 if (I->hasOneUse()) 7113 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7114 CCH = ComputeCCH(Store); 7115 } 7116 // For Z/Sext, the context is the operand, which must be a LoadInst. 7117 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7118 Opcode == Instruction::FPExt) { 7119 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7120 CCH = ComputeCCH(Load); 7121 } 7122 7123 // We optimize the truncation of induction variables having constant 7124 // integer steps. The cost of these truncations is the same as the scalar 7125 // operation. 7126 if (isOptimizableIVTruncate(I, VF)) { 7127 auto *Trunc = cast<TruncInst>(I); 7128 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7129 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7130 } 7131 7132 Type *SrcScalarTy = I->getOperand(0)->getType(); 7133 Type *SrcVecTy = 7134 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7135 if (canTruncateToMinimalBitwidth(I, VF)) { 7136 // This cast is going to be shrunk. This may remove the cast or it might 7137 // turn it into slightly different cast. For example, if MinBW == 16, 7138 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7139 // 7140 // Calculate the modified src and dest types. 7141 Type *MinVecTy = VectorTy; 7142 if (Opcode == Instruction::Trunc) { 7143 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7144 VectorTy = 7145 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7146 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7147 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7148 VectorTy = 7149 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7150 } 7151 } 7152 7153 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7154 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7155 return N * 7156 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7157 } 7158 case Instruction::Call: { 7159 bool NeedToScalarize; 7160 CallInst *CI = cast<CallInst>(I); 7161 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7162 if (getVectorIntrinsicIDForCall(CI, TLI)) 7163 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 7164 return CallCost; 7165 } 7166 case Instruction::ExtractValue: 7167 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7168 default: 7169 // The cost of executing VF copies of the scalar instruction. This opcode 7170 // is unknown. Assume that it is the same as 'mul'. 7171 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7172 Instruction::Mul, VectorTy, CostKind) + 7173 getScalarizationOverhead(I, VF); 7174 } // end of switch. 
7175 } 7176 7177 char LoopVectorize::ID = 0; 7178 7179 static const char lv_name[] = "Loop Vectorization"; 7180 7181 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7182 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7183 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7184 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7185 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7186 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7187 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7188 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7189 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7190 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7191 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7192 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7193 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7194 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7195 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7196 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7197 7198 namespace llvm { 7199 7200 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7201 7202 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7203 bool VectorizeOnlyWhenForced) { 7204 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7205 } 7206 7207 } // end namespace llvm 7208 7209 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7210 // Check if the pointer operand of a load or store instruction is 7211 // consecutive. 7212 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7213 return Legal->isConsecutivePtr(Ptr); 7214 return false; 7215 } 7216 7217 void LoopVectorizationCostModel::collectValuesToIgnore() { 7218 // Ignore ephemeral values. 7219 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7220 7221 // Ignore type-promoting instructions we identified during reduction 7222 // detection. 7223 for (auto &Reduction : Legal->getReductionVars()) { 7224 RecurrenceDescriptor &RedDes = Reduction.second; 7225 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7226 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7227 } 7228 // Ignore type-casting instructions we identified during induction 7229 // detection. 7230 for (auto &Induction : Legal->getInductionVars()) { 7231 InductionDescriptor &IndDes = Induction.second; 7232 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7233 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7234 } 7235 } 7236 7237 void LoopVectorizationCostModel::collectInLoopReductions() { 7238 for (auto &Reduction : Legal->getReductionVars()) { 7239 PHINode *Phi = Reduction.first; 7240 RecurrenceDescriptor &RdxDesc = Reduction.second; 7241 7242 // We don't collect reductions that are type promoted (yet). 7243 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7244 continue; 7245 7246 // If the target would prefer this reduction to happen "in-loop", then we 7247 // want to record it as such. 7248 unsigned Opcode = RdxDesc.getRecurrenceBinOp(); 7249 if (!PreferInLoopReductions && 7250 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7251 TargetTransformInfo::ReductionFlags())) 7252 continue; 7253 7254 // Check that we can correctly put the reductions into the loop, by 7255 // finding the chain of operations that leads from the phi to the loop 7256 // exit value. 
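    // For example (illustrative IR), for a simple integer add reduction
    //   %sum = phi i32 [ 0, %preheader ], [ %sum.next, %loop ]
    //   %sum.next = add i32 %sum, %val
    // the chain is just { %sum.next }; if no such chain can be found,
    // ReductionOperations stays empty and the reduction remains out-of-loop.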
7257     SmallVector<Instruction *, 4> ReductionOperations =
7258         RdxDesc.getReductionOpChain(Phi, TheLoop);
7259     bool InLoop = !ReductionOperations.empty();
7260     if (InLoop)
7261       InLoopReductionChains[Phi] = ReductionOperations;
7262     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7263                       << " reduction for phi: " << *Phi << "\n");
7264   }
7265 }
7266
7267 // TODO: we could return a pair of values that specify the max VF and
7268 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7269 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7270 // doesn't have a cost model that can choose which plan to execute if
7271 // more than one is generated.
7272 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7273                                  LoopVectorizationCostModel &CM) {
7274   unsigned WidestType;
7275   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7276   return WidestVectorRegBits / WidestType;
7277 }
7278
7279 VectorizationFactor
7280 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7281   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7282   ElementCount VF = UserVF;
7283   // Outer loop handling: outer loops may require CFG and instruction level
7284   // transformations before even evaluating whether vectorization is profitable.
7285   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7286   // the vectorization pipeline.
7287   if (!OrigLoop->isInnermost()) {
7288     // If the user doesn't provide a vectorization factor, determine a
7289     // reasonable one.
7290     if (UserVF.isZero()) {
7291       VF = ElementCount::getFixed(
7292           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7293       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7294
7295       // Make sure we have a VF > 1 for stress testing.
7296       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7297         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7298                           << "overriding computed VF.\n");
7299         VF = ElementCount::getFixed(4);
7300       }
7301     }
7302     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7303     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7304            "VF needs to be a power of two");
7305     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7306                       << "VF " << VF << " to build VPlans.\n");
7307     buildVPlans(VF, VF);
7308
7309     // For VPlan build stress testing, we bail out after VPlan construction.
7310     if (VPlanBuildStressTest)
7311       return VectorizationFactor::Disabled();
7312
7313     return {VF, 0 /*Cost*/};
7314   }
7315
7316   LLVM_DEBUG(
7317       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7318                 "VPlan-native path.\n");
7319   return VectorizationFactor::Disabled();
7320 }
7321
7322 Optional<VectorizationFactor>
7323 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7324   assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
7325   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7326   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7327   if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
7328     return None;
7329
7330   // Invalidate interleave groups if all blocks of the loop will be predicated.
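  // (If the header needs predication the tail is being folded by masking, so
  // every interleaved access would have to be emitted as a masked interleaved
  // access, which not all targets support.)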
7331 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7332 !useMaskedInterleavedAccesses(*TTI)) { 7333 LLVM_DEBUG( 7334 dbgs() 7335 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7336 "which requires masked-interleaved support.\n"); 7337 if (CM.InterleaveInfo.invalidateGroups()) 7338 // Invalidating interleave groups also requires invalidating all decisions 7339 // based on them, which includes widening decisions and uniform and scalar 7340 // values. 7341 CM.invalidateCostModelingDecisions(); 7342 } 7343 7344 ElementCount MaxVF = MaybeMaxVF.getValue(); 7345 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7346 7347 if (!UserVF.isZero() && UserVF.getFixedValue() <= MaxVF.getFixedValue()) { 7348 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7349 assert(isPowerOf2_32(UserVF.getFixedValue()) && 7350 "VF needs to be a power of two"); 7351 // Collect the instructions (and their associated costs) that will be more 7352 // profitable to scalarize. 7353 CM.selectUserVectorizationFactor(UserVF); 7354 CM.collectInLoopReductions(); 7355 buildVPlansWithVPRecipes(UserVF, UserVF); 7356 LLVM_DEBUG(printPlans(dbgs())); 7357 return {{UserVF, 0}}; 7358 } 7359 7360 for (ElementCount VF = ElementCount::getFixed(1); 7361 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7362 // Collect Uniform and Scalar instructions after vectorization with VF. 7363 CM.collectUniformsAndScalars(VF); 7364 7365 // Collect the instructions (and their associated costs) that will be more 7366 // profitable to scalarize. 7367 if (VF.isVector()) 7368 CM.collectInstsToScalarize(VF); 7369 } 7370 7371 CM.collectInLoopReductions(); 7372 7373 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7374 LLVM_DEBUG(printPlans(dbgs())); 7375 if (MaxVF.isScalar()) 7376 return VectorizationFactor::Disabled(); 7377 7378 // Select the optimal vectorization factor. 7379 return CM.selectVectorizationFactor(MaxVF); 7380 } 7381 7382 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7383 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7384 << '\n'); 7385 BestVF = VF; 7386 BestUF = UF; 7387 7388 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7389 return !Plan->hasVF(VF); 7390 }); 7391 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7392 } 7393 7394 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7395 DominatorTree *DT) { 7396 // Perform the actual loop transformation. 7397 7398 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7399 VPCallbackILV CallbackILV(ILV); 7400 7401 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7402 7403 VPTransformState State{*BestVF, BestUF, LI, 7404 DT, ILV.Builder, ILV.VectorLoopValueMap, 7405 &ILV, CallbackILV}; 7406 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7407 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7408 State.CanonicalIV = ILV.Induction; 7409 7410 ILV.printDebugTracesAtStart(); 7411 7412 //===------------------------------------------------===// 7413 // 7414 // Notice: any optimization or new instruction that go 7415 // into the code below should also be implemented in 7416 // the cost-model. 7417 // 7418 //===------------------------------------------------===// 7419 7420 // 2. Copy and widen instructions from the old loop into the new loop. 7421 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7422 VPlans.front()->execute(&State); 7423 7424 // 3. 
Fix the vectorized code: take care of header phi's, live-outs, 7425 // predication, updating analyses. 7426 ILV.fixVectorizedLoop(); 7427 7428 ILV.printDebugTracesAtEnd(); 7429 } 7430 7431 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7432 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7433 BasicBlock *Latch = OrigLoop->getLoopLatch(); 7434 7435 // We create new control-flow for the vectorized loop, so the original 7436 // condition will be dead after vectorization if it's only used by the 7437 // branch. 7438 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 7439 if (Cmp && Cmp->hasOneUse()) { 7440 DeadInstructions.insert(Cmp); 7441 7442 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7443 for (Value *Op : Cmp->operands()) { 7444 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7445 DeadInstructions.insert(cast<Instruction>(Op)); 7446 } 7447 } 7448 7449 // We create new "steps" for induction variable updates to which the original 7450 // induction variables map. An original update instruction will be dead if 7451 // all its users except the induction variable are dead. 7452 for (auto &Induction : Legal->getInductionVars()) { 7453 PHINode *Ind = Induction.first; 7454 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7455 7456 // If the tail is to be folded by masking, the primary induction variable, 7457 // if exists, isn't dead: it will be used for masking. Don't kill it. 7458 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7459 continue; 7460 7461 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7462 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7463 })) 7464 DeadInstructions.insert(IndUpdate); 7465 7466 // We record as "Dead" also the type-casting instructions we had identified 7467 // during induction analysis. We don't need any handling for them in the 7468 // vectorized loop because we have proven that, under a proper runtime 7469 // test guarding the vectorized loop, the value of the phi, and the casted 7470 // value of the phi, are the same. The last instruction in this casting chain 7471 // will get its scalar/vector/widened def from the scalar/vector/widened def 7472 // of the respective phi node. Any other casts in the induction def-use chain 7473 // have no other uses outside the phi update chain, and will be ignored. 7474 InductionDescriptor &IndDes = Induction.second; 7475 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7476 DeadInstructions.insert(Casts.begin(), Casts.end()); 7477 } 7478 } 7479 7480 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7481 7482 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7483 7484 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7485 Instruction::BinaryOps BinOp) { 7486 // When unrolling and the VF is 1, we only need to add a simple scalar. 7487 Type *Ty = Val->getType(); 7488 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7489 7490 if (Ty->isFloatingPointTy()) { 7491 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7492 7493 // Floating point operations had to be 'fast' to enable the unrolling. 
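    // For example (values for illustration only), with StartIdx == 2,
    // BinOp == FAdd and a scalar step %step this emits, conceptually,
    //   %mul = fmul fast double 2.000000e+00, %step
    //   %add = fadd fast double %val, %mul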
7494     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7495     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7496   }
7497   Constant *C = ConstantInt::get(Ty, StartIdx);
7498   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7499 }
7500
7501 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7502   SmallVector<Metadata *, 4> MDs;
7503   // Reserve first location for self reference to the LoopID metadata node.
7504   MDs.push_back(nullptr);
7505   bool IsUnrollMetadata = false;
7506   MDNode *LoopID = L->getLoopID();
7507   if (LoopID) {
7508     // First find existing loop unrolling disable metadata.
7509     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7510       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7511       if (MD) {
7512         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7513         IsUnrollMetadata =
7514             S && S->getString().startswith("llvm.loop.unroll.disable");
7515       }
7516       MDs.push_back(LoopID->getOperand(i));
7517     }
7518   }
7519
7520   if (!IsUnrollMetadata) {
7521     // Add runtime unroll disable metadata.
7522     LLVMContext &Context = L->getHeader()->getContext();
7523     SmallVector<Metadata *, 1> DisableOperands;
7524     DisableOperands.push_back(
7525         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7526     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7527     MDs.push_back(DisableNode);
7528     MDNode *NewLoopID = MDNode::get(Context, MDs);
7529     // Set operand 0 to refer to the loop id itself.
7530     NewLoopID->replaceOperandWith(0, NewLoopID);
7531     L->setLoopID(NewLoopID);
7532   }
7533 }
7534
7535 //===--------------------------------------------------------------------===//
7536 // EpilogueVectorizerMainLoop
7537 //===--------------------------------------------------------------------===//
7538
7539 /// This function is partially responsible for generating the control flow
7540 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7541 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7542   MDNode *OrigLoopID = OrigLoop->getLoopID();
7543   Loop *Lp = createVectorLoopSkeleton("");
7544
7545   // Generate the code to check the minimum iteration count of the vector
7546   // epilogue (see below).
7547   EPI.EpilogueIterationCountCheck =
7548       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7549   EPI.EpilogueIterationCountCheck->setName("iter.check");
7550
7551   // Generate the code to check any assumptions that we've made for SCEV
7552   // expressions.
7553   BasicBlock *SavedPreHeader = LoopVectorPreHeader;
7554   emitSCEVChecks(Lp, LoopScalarPreHeader);
7555
7556   // If a safety check was generated, save it.
7557   if (SavedPreHeader != LoopVectorPreHeader)
7558     EPI.SCEVSafetyCheck = SavedPreHeader;
7559
7560   // Generate the code that checks at runtime if arrays overlap. We put the
7561   // checks into a separate block to make the more common case of few elements
7562   // faster.
7563   SavedPreHeader = LoopVectorPreHeader;
7564   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7565
7566   // If a safety check was generated, save/overwrite it.
7567   if (SavedPreHeader != LoopVectorPreHeader)
7568     EPI.MemSafetyCheck = SavedPreHeader;
7569
7570   // Generate the iteration count check for the main loop, *after* the check
7571   // for the epilogue loop, so that the path-length is shorter for the case
7572   // that goes directly through the vector epilogue. The longer-path length for
7573   // the main loop is compensated for by the gain from vectorizing the larger
7574   // trip count.
Note: the branch will get updated later on when we vectorize 7575 // the epilogue. 7576 EPI.MainLoopIterationCountCheck = 7577 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7578 7579 // Generate the induction variable. 7580 OldInduction = Legal->getPrimaryInduction(); 7581 Type *IdxTy = Legal->getWidestInductionType(); 7582 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7583 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7584 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7585 EPI.VectorTripCount = CountRoundDown; 7586 Induction = 7587 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7588 getDebugLocFromInstOrOperands(OldInduction)); 7589 7590 // Skip induction resume value creation here because they will be created in 7591 // the second pass. If we created them here, they wouldn't be used anyway, 7592 // because the vplan in the second pass still contains the inductions from the 7593 // original loop. 7594 7595 return completeLoopSkeleton(Lp, OrigLoopID); 7596 } 7597 7598 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7599 LLVM_DEBUG({ 7600 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7601 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7602 << ", Main Loop UF:" << EPI.MainLoopUF 7603 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7604 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7605 }); 7606 } 7607 7608 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7609 DEBUG_WITH_TYPE(VerboseDebug, { 7610 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 7611 }); 7612 } 7613 7614 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 7615 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 7616 assert(L && "Expected valid Loop."); 7617 assert(Bypass && "Expected valid bypass basic block."); 7618 unsigned VFactor = 7619 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 7620 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7621 Value *Count = getOrCreateTripCount(L); 7622 // Reuse existing vector loop preheader for TC checks. 7623 // Note that new preheader block is generated for vector loop. 7624 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7625 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7626 7627 // Generate code to check if the loop's trip count is less than VF * UF of the 7628 // main vector loop. 7629 auto P = 7630 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7631 7632 Value *CheckMinIters = Builder.CreateICmp( 7633 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 7634 "min.iters.check"); 7635 7636 if (!ForEpilogue) 7637 TCCheckBlock->setName("vector.main.loop.iter.check"); 7638 7639 // Create new preheader for vector loop. 7640 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7641 DT, LI, nullptr, "vector.ph"); 7642 7643 if (ForEpilogue) { 7644 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7645 DT->getNode(Bypass)->getIDom()) && 7646 "TC check is expected to dominate Bypass"); 7647 7648 // Update dominator for Bypass & LoopExit. 7649 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7650 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7651 7652 LoopBypassBlocks.push_back(TCCheckBlock); 7653 7654 // Save the trip count so we don't have to regenerate it in the 7655 // vec.epilog.iter.check. 
This is safe to do because the trip count 7656 // generated here dominates the vector epilog iter check. 7657 EPI.TripCount = Count; 7658 } 7659 7660 ReplaceInstWithInst( 7661 TCCheckBlock->getTerminator(), 7662 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7663 7664 return TCCheckBlock; 7665 } 7666 7667 //===--------------------------------------------------------------------===// 7668 // EpilogueVectorizerEpilogueLoop 7669 //===--------------------------------------------------------------------===// 7670 7671 /// This function is partially responsible for generating the control flow 7672 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7673 BasicBlock * 7674 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7675 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7676 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 7677 7678 // Now, compare the remaining count and if there aren't enough iterations to 7679 // execute the vectorized epilogue skip to the scalar part. 7680 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7681 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7682 LoopVectorPreHeader = 7683 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7684 LI, nullptr, "vec.epilog.ph"); 7685 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 7686 VecEpilogueIterationCountCheck); 7687 7688 // Adjust the control flow taking the state info from the main loop 7689 // vectorization into account. 7690 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7691 "expected this to be saved from the previous pass."); 7692 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7693 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7694 7695 DT->changeImmediateDominator(LoopVectorPreHeader, 7696 EPI.MainLoopIterationCountCheck); 7697 7698 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7699 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7700 7701 if (EPI.SCEVSafetyCheck) 7702 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7703 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7704 if (EPI.MemSafetyCheck) 7705 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7706 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7707 7708 DT->changeImmediateDominator( 7709 VecEpilogueIterationCountCheck, 7710 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7711 7712 DT->changeImmediateDominator(LoopScalarPreHeader, 7713 EPI.EpilogueIterationCountCheck); 7714 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 7715 7716 // Keep track of bypass blocks, as they feed start values to the induction 7717 // phis in the scalar loop preheader. 
7718   if (EPI.SCEVSafetyCheck)
7719     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7720   if (EPI.MemSafetyCheck)
7721     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7722   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7723
7724   // Generate a resume induction for the vector epilogue and put it in the
7725   // vector epilogue preheader.
7726   Type *IdxTy = Legal->getWidestInductionType();
7727   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7728                                          LoopVectorPreHeader->getFirstNonPHI());
7729   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7730   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7731                            EPI.MainLoopIterationCountCheck);
7732
7733   // Generate the induction variable.
7734   OldInduction = Legal->getPrimaryInduction();
7735   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7736   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7737   Value *StartIdx = EPResumeVal;
7738   Induction =
7739       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7740                               getDebugLocFromInstOrOperands(OldInduction));
7741
7742   // Generate induction resume values. These variables save the new starting
7743   // indexes for the scalar loop. They are used to test if there are any tail
7744   // iterations left once the vector loop has completed.
7745   // Note that when the vectorized epilogue is skipped due to the iteration
7746   // count check, the resume value for the induction variable comes from
7747   // the trip count of the main vector loop, hence passing the AdditionalBypass
7748   // argument.
7749   createInductionResumeValues(Lp, CountRoundDown,
7750                               {VecEpilogueIterationCountCheck,
7751                                EPI.VectorTripCount} /* AdditionalBypass */);
7752
7753   AddRuntimeUnrollDisableMetaData(Lp);
7754   return completeLoopSkeleton(Lp, OrigLoopID);
7755 }
7756
7757 BasicBlock *
7758 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7759     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
7760
7761   assert(EPI.TripCount &&
7762          "Expected trip count to have been saved in the first pass.");
7763   assert(
7764       (!isa<Instruction>(EPI.TripCount) ||
7765        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7766       "saved trip count does not dominate insertion point.");
7767   Value *TC = EPI.TripCount;
7768   IRBuilder<> Builder(Insert->getTerminator());
7769   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7770
7771   // Generate code to check if the loop's trip count is less than VF * UF of the
7772   // vector epilogue loop.
7773   auto P =
7774       Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7775 7776 Value *CheckMinIters = Builder.CreateICmp( 7777 P, Count, 7778 ConstantInt::get(Count->getType(), 7779 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 7780 "min.epilog.iters.check"); 7781 7782 ReplaceInstWithInst( 7783 Insert->getTerminator(), 7784 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7785 7786 LoopBypassBlocks.push_back(Insert); 7787 return Insert; 7788 } 7789 7790 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7791 LLVM_DEBUG({ 7792 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7793 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7794 << ", Main Loop UF:" << EPI.MainLoopUF 7795 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7796 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7797 }); 7798 } 7799 7800 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7801 DEBUG_WITH_TYPE(VerboseDebug, { 7802 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 7803 }); 7804 } 7805 7806 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7807 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7808 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7809 bool PredicateAtRangeStart = Predicate(Range.Start); 7810 7811 for (ElementCount TmpVF = Range.Start * 2; 7812 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 7813 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7814 Range.End = TmpVF; 7815 break; 7816 } 7817 7818 return PredicateAtRangeStart; 7819 } 7820 7821 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7822 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7823 /// of VF's starting at a given VF and extending it as much as possible. Each 7824 /// vectorization decision can potentially shorten this sub-range during 7825 /// buildVPlan(). 7826 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7827 ElementCount MaxVF) { 7828 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 7829 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 7830 VFRange SubRange = {VF, MaxVFPlusOne}; 7831 VPlans.push_back(buildVPlan(SubRange)); 7832 VF = SubRange.End; 7833 } 7834 } 7835 7836 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7837 VPlanPtr &Plan) { 7838 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7839 7840 // Look for cached value. 7841 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7842 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7843 if (ECEntryIt != EdgeMaskCache.end()) 7844 return ECEntryIt->second; 7845 7846 VPValue *SrcMask = createBlockInMask(Src, Plan); 7847 7848 // The terminator has to be a branch inst! 7849 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7850 assert(BI && "Unexpected terminator found"); 7851 7852 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7853 return EdgeMaskCache[Edge] = SrcMask; 7854 7855 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 7856 assert(EdgeMask && "No Edge Mask found for condition"); 7857 7858 if (BI->getSuccessor(0) != Dst) 7859 EdgeMask = Builder.createNot(EdgeMask); 7860 7861 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 
7862 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 7863 7864 return EdgeMaskCache[Edge] = EdgeMask; 7865 } 7866 7867 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7868 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7869 7870 // Look for cached value. 7871 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7872 if (BCEntryIt != BlockMaskCache.end()) 7873 return BCEntryIt->second; 7874 7875 // All-one mask is modelled as no-mask following the convention for masked 7876 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7877 VPValue *BlockMask = nullptr; 7878 7879 if (OrigLoop->getHeader() == BB) { 7880 if (!CM.blockNeedsPredication(BB)) 7881 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 7882 7883 // Create the block in mask as the first non-phi instruction in the block. 7884 VPBuilder::InsertPointGuard Guard(Builder); 7885 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 7886 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 7887 7888 // Introduce the early-exit compare IV <= BTC to form header block mask. 7889 // This is used instead of IV < TC because TC may wrap, unlike BTC. 7890 // Start by constructing the desired canonical IV. 7891 VPValue *IV = nullptr; 7892 if (Legal->getPrimaryInduction()) 7893 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 7894 else { 7895 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 7896 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 7897 IV = IVRecipe->getVPValue(); 7898 } 7899 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 7900 bool TailFolded = !CM.isScalarEpilogueAllowed(); 7901 7902 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 7903 // While ActiveLaneMask is a binary op that consumes the loop tripcount 7904 // as a second argument, we only pass the IV here and extract the 7905 // tripcount from the transform state where codegen of the VP instructions 7906 // happen. 7907 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 7908 } else { 7909 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 7910 } 7911 return BlockMaskCache[BB] = BlockMask; 7912 } 7913 7914 // This is the block mask. We OR all incoming edges. 7915 for (auto *Predecessor : predecessors(BB)) { 7916 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 7917 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 7918 return BlockMaskCache[BB] = EdgeMask; 7919 7920 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
7921 BlockMask = EdgeMask; 7922 continue; 7923 } 7924 7925 BlockMask = Builder.createOr(BlockMask, EdgeMask); 7926 } 7927 7928 return BlockMaskCache[BB] = BlockMask; 7929 } 7930 7931 VPWidenMemoryInstructionRecipe * 7932 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 7933 VPlanPtr &Plan) { 7934 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7935 "Must be called with either a load or store"); 7936 7937 auto willWiden = [&](ElementCount VF) -> bool { 7938 assert(!VF.isScalable() && "unexpected scalable ElementCount"); 7939 if (VF.isScalar()) 7940 return false; 7941 LoopVectorizationCostModel::InstWidening Decision = 7942 CM.getWideningDecision(I, VF); 7943 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 7944 "CM decision should be taken at this point."); 7945 if (Decision == LoopVectorizationCostModel::CM_Interleave) 7946 return true; 7947 if (CM.isScalarAfterVectorization(I, VF) || 7948 CM.isProfitableToScalarize(I, VF)) 7949 return false; 7950 return Decision != LoopVectorizationCostModel::CM_Scalarize; 7951 }; 7952 7953 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7954 return nullptr; 7955 7956 VPValue *Mask = nullptr; 7957 if (Legal->isMaskRequired(I)) 7958 Mask = createBlockInMask(I->getParent(), Plan); 7959 7960 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 7961 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 7962 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 7963 7964 StoreInst *Store = cast<StoreInst>(I); 7965 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 7966 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 7967 } 7968 7969 VPWidenIntOrFpInductionRecipe * 7970 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 7971 // Check if this is an integer or fp induction. If so, build the recipe that 7972 // produces its scalar and vector values. 7973 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 7974 if (II.getKind() == InductionDescriptor::IK_IntInduction || 7975 II.getKind() == InductionDescriptor::IK_FpInduction) 7976 return new VPWidenIntOrFpInductionRecipe(Phi); 7977 7978 return nullptr; 7979 } 7980 7981 VPWidenIntOrFpInductionRecipe * 7982 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 7983 VFRange &Range) const { 7984 // Optimize the special case where the source is a constant integer 7985 // induction variable. Notice that we can only optimize the 'trunc' case 7986 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 7987 // (c) other casts depend on pointer size. 7988 7989 // Determine whether \p K is a truncation based on an induction variable that 7990 // can be optimized. 7991 auto isOptimizableIVTruncate = 7992 [&](Instruction *K) -> std::function<bool(ElementCount)> { 7993 return [=](ElementCount VF) -> bool { 7994 return CM.isOptimizableIVTruncate(K, VF); 7995 }; 7996 }; 7997 7998 if (LoopVectorizationPlanner::getDecisionAndClampRange( 7999 isOptimizableIVTruncate(I), Range)) 8000 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8001 I); 8002 return nullptr; 8003 } 8004 8005 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8006 // We know that all PHIs in non-header blocks are converted into selects, so 8007 // we don't have to worry about the insertion order and we can just use the 8008 // builder. At this point we generate the predication tree. 
There may be 8009 // duplications since this is a simple recursive scan, but future 8010 // optimizations will clean it up. 8011 8012 SmallVector<VPValue *, 2> Operands; 8013 unsigned NumIncoming = Phi->getNumIncomingValues(); 8014 for (unsigned In = 0; In < NumIncoming; In++) { 8015 VPValue *EdgeMask = 8016 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8017 assert((EdgeMask || NumIncoming == 1) && 8018 "Multiple predecessors with one having a full mask"); 8019 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8020 if (EdgeMask) 8021 Operands.push_back(EdgeMask); 8022 } 8023 return new VPBlendRecipe(Phi, Operands); 8024 } 8025 8026 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8027 VPlan &Plan) const { 8028 8029 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8030 [this, CI](ElementCount VF) { 8031 return CM.isScalarWithPredication(CI, VF); 8032 }, 8033 Range); 8034 8035 if (IsPredicated) 8036 return nullptr; 8037 8038 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8039 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8040 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8041 ID == Intrinsic::pseudoprobe)) 8042 return nullptr; 8043 8044 auto willWiden = [&](ElementCount VF) -> bool { 8045 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8046 // The following case may be scalarized depending on the VF. 8047 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8048 // version of the instruction. 8049 // Is it beneficial to perform intrinsic call compared to lib call? 8050 bool NeedToScalarize = false; 8051 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8052 bool UseVectorIntrinsic = 8053 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 8054 return UseVectorIntrinsic || !NeedToScalarize; 8055 }; 8056 8057 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8058 return nullptr; 8059 8060 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8061 } 8062 8063 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8064 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8065 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8066 // Instruction should be widened, unless it is scalar after vectorization, 8067 // scalarization is profitable or it is predicated. 
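  // (Note that getDecisionAndClampRange may shrink Range so that all VFs left
  // in it agree with the decision taken for Range.Start.)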
8068 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8069 return CM.isScalarAfterVectorization(I, VF) || 8070 CM.isProfitableToScalarize(I, VF) || 8071 CM.isScalarWithPredication(I, VF); 8072 }; 8073 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8074 Range); 8075 } 8076 8077 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8078 auto IsVectorizableOpcode = [](unsigned Opcode) { 8079 switch (Opcode) { 8080 case Instruction::Add: 8081 case Instruction::And: 8082 case Instruction::AShr: 8083 case Instruction::BitCast: 8084 case Instruction::FAdd: 8085 case Instruction::FCmp: 8086 case Instruction::FDiv: 8087 case Instruction::FMul: 8088 case Instruction::FNeg: 8089 case Instruction::FPExt: 8090 case Instruction::FPToSI: 8091 case Instruction::FPToUI: 8092 case Instruction::FPTrunc: 8093 case Instruction::FRem: 8094 case Instruction::FSub: 8095 case Instruction::ICmp: 8096 case Instruction::IntToPtr: 8097 case Instruction::LShr: 8098 case Instruction::Mul: 8099 case Instruction::Or: 8100 case Instruction::PtrToInt: 8101 case Instruction::SDiv: 8102 case Instruction::Select: 8103 case Instruction::SExt: 8104 case Instruction::Shl: 8105 case Instruction::SIToFP: 8106 case Instruction::SRem: 8107 case Instruction::Sub: 8108 case Instruction::Trunc: 8109 case Instruction::UDiv: 8110 case Instruction::UIToFP: 8111 case Instruction::URem: 8112 case Instruction::Xor: 8113 case Instruction::ZExt: 8114 return true; 8115 } 8116 return false; 8117 }; 8118 8119 if (!IsVectorizableOpcode(I->getOpcode())) 8120 return nullptr; 8121 8122 // Success: widen this instruction. 8123 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8124 } 8125 8126 VPBasicBlock *VPRecipeBuilder::handleReplication( 8127 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8128 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 8129 VPlanPtr &Plan) { 8130 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8131 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8132 Range); 8133 8134 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8135 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8136 Range); 8137 8138 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8139 IsUniform, IsPredicated); 8140 setRecipe(I, Recipe); 8141 Plan->addVPValue(I, Recipe); 8142 8143 // Find if I uses a predicated instruction. If so, it will use its scalar 8144 // value. Avoid hoisting the insert-element which packs the scalar value into 8145 // a vector value, as that happens iff all users use the vector value. 8146 for (auto &Op : I->operands()) 8147 if (auto *PredInst = dyn_cast<Instruction>(Op)) 8148 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 8149 PredInst2Recipe[PredInst]->setAlsoPack(false); 8150 8151 // Finalize the recipe for Instr, first if it is not predicated. 8152 if (!IsPredicated) { 8153 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8154 VPBB->appendRecipe(Recipe); 8155 return VPBB; 8156 } 8157 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8158 assert(VPBB->getSuccessors().empty() && 8159 "VPBB has successors when handling predicated replication."); 8160 // Record predicated instructions for above packing optimizations. 
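  // The replicate region created below is a triangle of VPBasicBlocks:
  //   pred.<opcode>.entry    - branch-on-mask
  //   pred.<opcode>.if       - the replicated (predicated) recipe
  //   pred.<opcode>.continue - optional phi merging the predicated value
  // with edges entry->if, entry->continue and if->continue.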
8161 PredInst2Recipe[I] = Recipe; 8162 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8163 VPBlockUtils::insertBlockAfter(Region, VPBB); 8164 auto *RegSucc = new VPBasicBlock(); 8165 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8166 return RegSucc; 8167 } 8168 8169 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8170 VPRecipeBase *PredRecipe, 8171 VPlanPtr &Plan) { 8172 // Instructions marked for predication are replicated and placed under an 8173 // if-then construct to prevent side-effects. 8174 8175 // Generate recipes to compute the block mask for this region. 8176 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8177 8178 // Build the triangular if-then region. 8179 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8180 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8181 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8182 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8183 auto *PHIRecipe = Instr->getType()->isVoidTy() 8184 ? nullptr 8185 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8186 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8187 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8188 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8189 8190 // Note: first set Entry as region entry and then connect successors starting 8191 // from it in order, to propagate the "parent" of each VPBasicBlock. 8192 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8193 VPBlockUtils::connectBlocks(Pred, Exit); 8194 8195 return Region; 8196 } 8197 8198 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8199 VFRange &Range, 8200 VPlanPtr &Plan) { 8201 // First, check for specific widening recipes that deal with calls, memory 8202 // operations, inductions and Phi nodes. 8203 if (auto *CI = dyn_cast<CallInst>(Instr)) 8204 return tryToWidenCall(CI, Range, *Plan); 8205 8206 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8207 return tryToWidenMemory(Instr, Range, Plan); 8208 8209 VPRecipeBase *Recipe; 8210 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8211 if (Phi->getParent() != OrigLoop->getHeader()) 8212 return tryToBlend(Phi, Plan); 8213 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 8214 return Recipe; 8215 return new VPWidenPHIRecipe(Phi); 8216 } 8217 8218 if (isa<TruncInst>(Instr) && 8219 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 8220 return Recipe; 8221 8222 if (!shouldWiden(Instr, Range)) 8223 return nullptr; 8224 8225 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8226 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 8227 OrigLoop); 8228 8229 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8230 bool InvariantCond = 8231 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8232 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 8233 InvariantCond); 8234 } 8235 8236 return tryToWiden(Instr, *Plan); 8237 } 8238 8239 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8240 ElementCount MaxVF) { 8241 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8242 8243 // Collect instructions from the original loop that will become trivially dead 8244 // in the vectorized loop. We don't need to vectorize these instructions. 
  // For example, original induction update instructions can become dead
  // because we separately emit induction "steps" when generating code for the
  // new loop. Similarly, we create a new latch condition when setting up the
  // structure of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor::RecurrenceKind Kind =
        Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
          Kind == RecurrenceDescriptor::RK_FloatMinMax) {
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
      }
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  };

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
        continue;

      if (auto Recipe =
              RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
        // Check if the recipe can be converted to a VPValue. We need the extra
        // down-casting step until VPRecipeBase inherits from VPValue.
        VPValue *MaybeVPValue = Recipe->toVPValue();
        if (!Instr->getType()->isVoidTy() && MaybeVPValue)
          Plan->addVPValue(Instr, MaybeVPValue);

        RecipeBuilder.setRecipe(Instr, Recipe);
        VPBB->appendRecipe(Recipe);
        continue;
      }

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
          Instr, Range, VPBB, PredInst2Recipe, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one (VPBB), reflecting original
  // basic-blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Apply Sink-After legal constraints.
  for (auto &Entry : SinkAfter) {
    VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
    VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
    Sink->moveAfter(Target);
  }

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
  for (auto IG : InterleaveGroups) {
    auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
        RecipeBuilder.getRecipe(IG->getInsertPos()));
    SmallVector<VPValue *, 4> StoredValues;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
        StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));

    (new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
                            Recipe->getMask()))
        ->insertBefore(Recipe);

    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *Member = IG->getMember(i)) {
        if (!Member->getType()->isVoidTy()) {
          VPValue *OriginalV = Plan->getVPValue(Member);
          Plan->removeVPValueFor(Member);
          OriginalV->replaceAllUsesWith(Plan->getOrAddVPValue(Member));
        }
        RecipeBuilder.getRecipe(Member)->eraseFromParent();
      }
  }

  // Adjust the recipes for any inloop reductions.
  if (Range.Start.isVector())
    adjustRecipesForInLoopReductions(Plan, RecipeBuilder);

  // Finally, if tail is folded by masking, introduce selects between the phi
  // and the live-out instruction of each reduction, at the end of the latch.
  if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
    Builder.setInsertPoint(VPBB);
    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
    for (auto &Reduction : Legal->getReductionVars()) {
      if (CM.isInLoopReduction(Reduction.first))
        continue;
      VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
      VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
      Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
    }
  }

  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  ElementCount VF = Range.Start;
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}

VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: they may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
       VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(
      OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
  return Plan;
}

// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
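    // A minimal sketch, assuming a simple in-loop integer sum (s += a[i]),
    // with hypothetical value names:
    //   Phi                 = %s.phi
    //   ReductionOperations = { %add = add i32 %s.phi, %load }
    // Walking the chain below, Chain starts at %s.phi, so for %add the chain
    // operand is %s.phi and the vector operand is %load; the %add recipe is
    // then replaced by a VPReductionRecipe with those operands.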
8522 Instruction *Chain = Phi; 8523 for (Instruction *R : ReductionOperations) { 8524 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 8525 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind(); 8526 8527 VPValue *ChainOp = Plan->getVPValue(Chain); 8528 unsigned FirstOpId; 8529 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8530 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8531 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 8532 "Expected to replace a VPWidenSelectSC"); 8533 FirstOpId = 1; 8534 } else { 8535 assert(isa<VPWidenRecipe>(WidenRecipe) && 8536 "Expected to replace a VPWidenSC"); 8537 FirstOpId = 0; 8538 } 8539 unsigned VecOpId = 8540 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 8541 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 8542 8543 auto *CondOp = CM.foldTailByMasking() 8544 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 8545 : nullptr; 8546 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 8547 &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); 8548 WidenRecipe->toVPValue()->replaceAllUsesWith(RedRecipe); 8549 Plan->removeVPValueFor(R); 8550 Plan->addVPValue(R, RedRecipe); 8551 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 8552 WidenRecipe->eraseFromParent(); 8553 8554 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8555 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8556 VPRecipeBase *CompareRecipe = 8557 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 8558 assert(isa<VPWidenRecipe>(CompareRecipe) && 8559 "Expected to replace a VPWidenSC"); 8560 assert(CompareRecipe->toVPValue()->getNumUsers() == 0 && 8561 "Expected no remaining users"); 8562 CompareRecipe->eraseFromParent(); 8563 } 8564 Chain = R; 8565 } 8566 } 8567 } 8568 8569 Value* LoopVectorizationPlanner::VPCallbackILV:: 8570 getOrCreateVectorValues(Value *V, unsigned Part) { 8571 return ILV.getOrCreateVectorValue(V, Part); 8572 } 8573 8574 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 8575 Value *V, const VPIteration &Instance) { 8576 return ILV.getOrCreateScalarValue(V, Instance); 8577 } 8578 8579 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 8580 VPSlotTracker &SlotTracker) const { 8581 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 8582 IG->getInsertPos()->printAsOperand(O, false); 8583 O << ", "; 8584 getAddr()->printAsOperand(O, SlotTracker); 8585 VPValue *Mask = getMask(); 8586 if (Mask) { 8587 O << ", "; 8588 Mask->printAsOperand(O, SlotTracker); 8589 } 8590 for (unsigned i = 0; i < IG->getFactor(); ++i) 8591 if (Instruction *I = IG->getMember(i)) 8592 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 8593 } 8594 8595 void VPWidenCallRecipe::execute(VPTransformState &State) { 8596 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 8597 *this, State); 8598 } 8599 8600 void VPWidenSelectRecipe::execute(VPTransformState &State) { 8601 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 8602 this, *this, InvariantCond, State); 8603 } 8604 8605 void VPWidenRecipe::execute(VPTransformState &State) { 8606 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 8607 } 8608 8609 void VPWidenGEPRecipe::execute(VPTransformState &State) { 8610 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 8611 *this, State.UF, State.VF, IsPtrLoopInvariant, 8612 IsIndexLoopInvariant, State); 8613 } 8614 8615 void 
VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 8616 assert(!State.Instance && "Int or FP induction being replicated."); 8617 State.ILV->widenIntOrFpInduction(IV, Trunc); 8618 } 8619 8620 void VPWidenPHIRecipe::execute(VPTransformState &State) { 8621 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 8622 } 8623 8624 void VPBlendRecipe::execute(VPTransformState &State) { 8625 State.ILV->setDebugLocFromInst(State.Builder, Phi); 8626 // We know that all PHIs in non-header blocks are converted into 8627 // selects, so we don't have to worry about the insertion order and we 8628 // can just use the builder. 8629 // At this point we generate the predication tree. There may be 8630 // duplications since this is a simple recursive scan, but future 8631 // optimizations will clean it up. 8632 8633 unsigned NumIncoming = getNumIncomingValues(); 8634 8635 // Generate a sequence of selects of the form: 8636 // SELECT(Mask3, In3, 8637 // SELECT(Mask2, In2, 8638 // SELECT(Mask1, In1, 8639 // In0))) 8640 // Note that Mask0 is never used: lanes for which no path reaches this phi and 8641 // are essentially undef are taken from In0. 8642 InnerLoopVectorizer::VectorParts Entry(State.UF); 8643 for (unsigned In = 0; In < NumIncoming; ++In) { 8644 for (unsigned Part = 0; Part < State.UF; ++Part) { 8645 // We might have single edge PHIs (blocks) - use an identity 8646 // 'select' for the first PHI operand. 8647 Value *In0 = State.get(getIncomingValue(In), Part); 8648 if (In == 0) 8649 Entry[Part] = In0; // Initialize with the first incoming value. 8650 else { 8651 // Select between the current value and the previous incoming edge 8652 // based on the incoming mask. 8653 Value *Cond = State.get(getMask(In), Part); 8654 Entry[Part] = 8655 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 8656 } 8657 } 8658 } 8659 for (unsigned Part = 0; Part < State.UF; ++Part) 8660 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 8661 } 8662 8663 void VPInterleaveRecipe::execute(VPTransformState &State) { 8664 assert(!State.Instance && "Interleave group being replicated."); 8665 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getStoredValues(), 8666 getMask()); 8667 } 8668 8669 void VPReductionRecipe::execute(VPTransformState &State) { 8670 assert(!State.Instance && "Reduction being replicated."); 8671 for (unsigned Part = 0; Part < State.UF; ++Part) { 8672 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind(); 8673 Value *NewVecOp = State.get(getVecOp(), Part); 8674 if (VPValue *Cond = getCondOp()) { 8675 Value *NewCond = State.get(Cond, Part); 8676 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 8677 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 8678 Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType()); 8679 Constant *IdenVec = 8680 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 8681 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 8682 NewVecOp = Select; 8683 } 8684 Value *NewRed = 8685 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN); 8686 Value *PrevInChain = State.get(getChainOp(), Part); 8687 Value *NextInChain; 8688 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8689 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8690 NextInChain = 8691 createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(), 8692 NewRed, PrevInChain); 8693 } else { 8694 NextInChain = State.Builder.CreateBinOp( 8695 
          (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
          PrevInChain);
    }
    State.set(this, getUnderlyingInstr(), NextInChain, Part);
  }
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
                                    *State.Instance, IsPredicated, State);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Undef = UndefValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.ValueMap.setVectorValue(getUnderlyingInstr(),
                                      State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
                                           *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
                                      IsPredicated, State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
8771 // Otherwise, a phi node for the scalar value is needed. 8772 unsigned Part = State.Instance->Part; 8773 Instruction *PredInst = 8774 cast<Instruction>(getOperand(0)->getUnderlyingValue()); 8775 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 8776 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 8777 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 8778 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 8779 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 8780 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 8781 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 8782 } else { 8783 Type *PredInstType = PredInst->getType(); 8784 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 8785 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 8786 Phi->addIncoming(ScalarPredInst, PredicatedBB); 8787 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 8788 } 8789 } 8790 8791 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 8792 Instruction *Instr = getUnderlyingInstr(); 8793 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 8794 State.ILV->vectorizeMemoryInstruction(Instr, State, 8795 StoredValue ? nullptr : this, getAddr(), 8796 StoredValue, getMask()); 8797 } 8798 8799 // Determine how to lower the scalar epilogue, which depends on 1) optimising 8800 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 8801 // predication, and 4) a TTI hook that analyses whether the loop is suitable 8802 // for predication. 8803 static ScalarEpilogueLowering getScalarEpilogueLowering( 8804 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 8805 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 8806 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 8807 LoopVectorizationLegality &LVL) { 8808 // 1) OptSize takes precedence over all other options, i.e. if this is set, 8809 // don't look at hints or options, and don't request a scalar epilogue. 8810 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 8811 // LoopAccessInfo (due to code dependency and not being able to reliably get 8812 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 8813 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 8814 // versioning when the vectorization is forced, unlike hasOptSize. So revert 8815 // back to the old way and vectorize with versioning when forced. See D81345.) 8816 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 8817 PGSOQueryType::IRPass) && 8818 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 8819 return CM_ScalarEpilogueNotAllowedOptSize; 8820 8821 bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() && 8822 !PreferPredicateOverEpilogue; 8823 8824 // 2) Next, if disabling predication is requested on the command line, honour 8825 // this and request a scalar epilogue. 8826 if (PredicateOptDisabled) 8827 return CM_ScalarEpilogueAllowed; 8828 8829 // 3) and 4) look if enabling predication is requested on the command line, 8830 // with a loop hint, or if the TTI hook indicates this is profitable, request 8831 // predication. 
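  // For instance (illustrative only): under -Os with no forcing hint, case 1)
  // above already returned CM_ScalarEpilogueNotAllowedOptSize; if the
  // PreferPredicateOverEpilogue option was explicitly set to false, case 2)
  // returned CM_ScalarEpilogueAllowed; only otherwise do the hint and TTI
  // checks below get a chance to request tail folding by masking.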
  if (PreferPredicateOverEpilogue ||
      Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
      (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                        LVL.getLAI()) &&
       Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
                           unsigned Part) {
  set(Def, V, Part);
  ILV->setVectorValue(IRDef, Part, V);
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();
  if (UserVF.isScalable()) {
    // TODO: Use scalable UserVF once we've added initial support for scalable
    // vectorization. For now we convert it to fixed width, but this will be
    // removed in a later patch.
    UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
  }

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM, BFI, PSI);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
8905 Hints.setAlreadyVectorized(); 8906 8907 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 8908 return true; 8909 } 8910 8911 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 8912 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 8913 !EnableLoopInterleaving), 8914 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 8915 !EnableLoopVectorization) {} 8916 8917 bool LoopVectorizePass::processLoop(Loop *L) { 8918 assert((EnableVPlanNativePath || L->isInnermost()) && 8919 "VPlan-native path is not enabled. Only process inner loops."); 8920 8921 #ifndef NDEBUG 8922 const std::string DebugLocStr = getDebugLocString(L); 8923 #endif /* NDEBUG */ 8924 8925 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 8926 << L->getHeader()->getParent()->getName() << "\" from " 8927 << DebugLocStr << "\n"); 8928 8929 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 8930 8931 LLVM_DEBUG( 8932 dbgs() << "LV: Loop hints:" 8933 << " force=" 8934 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 8935 ? "disabled" 8936 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 8937 ? "enabled" 8938 : "?")) 8939 << " width=" << Hints.getWidth() 8940 << " unroll=" << Hints.getInterleave() << "\n"); 8941 8942 // Function containing loop 8943 Function *F = L->getHeader()->getParent(); 8944 8945 // Looking at the diagnostic output is the only way to determine if a loop 8946 // was vectorized (other than looking at the IR or machine code), so it 8947 // is important to generate an optimization remark for each loop. Most of 8948 // these messages are generated as OptimizationRemarkAnalysis. Remarks 8949 // generated as OptimizationRemark and OptimizationRemarkMissed are 8950 // less verbose reporting vectorized loops and unvectorized loops that may 8951 // benefit from vectorization, respectively. 8952 8953 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 8954 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 8955 return false; 8956 } 8957 8958 PredicatedScalarEvolution PSE(*SE, *L); 8959 8960 // Check if it is legal to vectorize the loop. 8961 LoopVectorizationRequirements Requirements(*ORE); 8962 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 8963 &Requirements, &Hints, DB, AC, BFI, PSI); 8964 if (!LVL.canVectorize(EnableVPlanNativePath)) { 8965 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 8966 Hints.emitRemarkWithHints(); 8967 return false; 8968 } 8969 8970 // Check the function attributes and profiles to find out if this function 8971 // should be optimized for size. 8972 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8973 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 8974 8975 // Entrance to the VPlan-native vectorization path. Outer loops are processed 8976 // here. They may require CFG and instruction level transformations before 8977 // even evaluating whether vectorization is profitable. Since we cannot modify 8978 // the incoming IR, we need to build VPlan upfront in the vectorization 8979 // pipeline. 8980 if (!L->isInnermost()) 8981 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 8982 ORE, BFI, PSI, Hints); 8983 8984 assert(L->isInnermost() && "Inner loop expected."); 8985 8986 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 8987 // count by optimizing for size, to minimize overheads. 
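  // Illustrative example: for a loop whose trip count is known (or estimated
  // from profile data) to be, say, 3, and thus below
  // TinyTripCountVectorThreshold, the code below downgrades SEL to
  // CM_ScalarEpilogueNotAllowedLowTripLoop so that any vector body must cover
  // all iterations without a scalar remainder, unless vectorization was
  // explicitly forced.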
8988 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 8989 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 8990 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 8991 << "This loop is worth vectorizing only if no scalar " 8992 << "iteration overheads are incurred."); 8993 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 8994 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 8995 else { 8996 LLVM_DEBUG(dbgs() << "\n"); 8997 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 8998 } 8999 } 9000 9001 // Check the function attributes to see if implicit floats are allowed. 9002 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9003 // an integer loop and the vector instructions selected are purely integer 9004 // vector instructions? 9005 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9006 reportVectorizationFailure( 9007 "Can't vectorize when the NoImplicitFloat attribute is used", 9008 "loop not vectorized due to NoImplicitFloat attribute", 9009 "NoImplicitFloat", ORE, L); 9010 Hints.emitRemarkWithHints(); 9011 return false; 9012 } 9013 9014 // Check if the target supports potentially unsafe FP vectorization. 9015 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9016 // for the target we're vectorizing for, to make sure none of the 9017 // additional fp-math flags can help. 9018 if (Hints.isPotentiallyUnsafe() && 9019 TTI->isFPVectorizationPotentiallyUnsafe()) { 9020 reportVectorizationFailure( 9021 "Potentially unsafe FP op prevents vectorization", 9022 "loop not vectorized due to unsafe FP support.", 9023 "UnsafeFP", ORE, L); 9024 Hints.emitRemarkWithHints(); 9025 return false; 9026 } 9027 9028 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9029 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9030 9031 // If an override option has been passed in for interleaved accesses, use it. 9032 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9033 UseInterleaved = EnableInterleavedMemAccesses; 9034 9035 // Analyze interleaved memory accesses. 9036 if (UseInterleaved) { 9037 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 9038 } 9039 9040 // Use the cost model. 9041 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 9042 F, &Hints, IAI); 9043 CM.collectValuesToIgnore(); 9044 9045 // Use the planner for vectorization. 9046 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 9047 9048 // Get user vectorization factor and interleave count. 9049 ElementCount UserVF = Hints.getWidth(); 9050 if (UserVF.isScalable()) { 9051 // TODO: Use scalable UserVF once we've added initial support for scalable 9052 // vectorization. For now we convert it to fixed width, but this will be 9053 // removed in a later patch. 9054 UserVF = ElementCount::getFixed(UserVF.getKnownMinValue()); 9055 } 9056 9057 unsigned UserIC = Hints.getInterleave(); 9058 9059 // Plan how to best vectorize, return the best VF and its cost. 9060 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 9061 9062 VectorizationFactor VF = VectorizationFactor::Disabled(); 9063 unsigned IC = 1; 9064 9065 if (MaybeVF) { 9066 VF = *MaybeVF; 9067 // Select the interleave count. 9068 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 9069 } 9070 9071 // Identify the diagnostic messages that should be produced. 
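  // The two flags computed below combine as follows (sketch of the cases
  // handled further down):
  //   VectorizeLoop  InterleaveLoop   outcome
  //   -------------  --------------   ------------------------------------
  //   false          false            bail out; emit both missed remarks
  //   false          true             interleave only (InnerLoopUnroller)
  //   true           false            vectorize, without extra interleaving
  //   true           true             vectorize and interleave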
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
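    // When remarks are enabled (e.g. with clang's -Rpass-missed=loop-vectorize
    // and -Rpass-analysis=loop-vectorize), the two messages built above reach
    // the user roughly as (illustrative wording for the scalar-VF, IC == 1
    // case):
    //   remark: ... the cost-model indicates that vectorization is not
    //           beneficial
    //   remark: ... the cost-model indicates that interleaving is not
    //           beneficial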
9128 ORE->emit([&]() { 9129 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 9130 L->getStartLoc(), L->getHeader()) 9131 << VecDiagMsg.second; 9132 }); 9133 ORE->emit([&]() { 9134 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 9135 L->getStartLoc(), L->getHeader()) 9136 << IntDiagMsg.second; 9137 }); 9138 return false; 9139 } else if (!VectorizeLoop && InterleaveLoop) { 9140 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9141 ORE->emit([&]() { 9142 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 9143 L->getStartLoc(), L->getHeader()) 9144 << VecDiagMsg.second; 9145 }); 9146 } else if (VectorizeLoop && !InterleaveLoop) { 9147 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9148 << ") in " << DebugLocStr << '\n'); 9149 ORE->emit([&]() { 9150 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 9151 L->getStartLoc(), L->getHeader()) 9152 << IntDiagMsg.second; 9153 }); 9154 } else if (VectorizeLoop && InterleaveLoop) { 9155 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9156 << ") in " << DebugLocStr << '\n'); 9157 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9158 } 9159 9160 LVP.setBestPlan(VF.Width, IC); 9161 9162 using namespace ore; 9163 bool DisableRuntimeUnroll = false; 9164 MDNode *OrigLoopID = L->getLoopID(); 9165 9166 if (!VectorizeLoop) { 9167 assert(IC > 1 && "interleave count should not be 1 or 0"); 9168 // If we decided that it is not legal to vectorize the loop, then 9169 // interleave it. 9170 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, 9171 BFI, PSI); 9172 LVP.executePlan(Unroller, DT); 9173 9174 ORE->emit([&]() { 9175 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 9176 L->getHeader()) 9177 << "interleaved loop (interleaved count: " 9178 << NV("InterleaveCount", IC) << ")"; 9179 }); 9180 } else { 9181 // If we decided that it is *legal* to vectorize the loop, then do it. 9182 9183 // Consider vectorizing the epilogue too if it's profitable. 9184 VectorizationFactor EpilogueVF = 9185 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 9186 if (EpilogueVF.Width.isVector()) { 9187 9188 // The first pass vectorizes the main loop and creates a scalar epilogue 9189 // to be vectorized by executing the plan (potentially with a different 9190 // factor) again shortly afterwards. 9191 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 9192 EpilogueVF.Width.getKnownMinValue(), 1); 9193 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, 9194 &LVL, &CM, BFI, PSI); 9195 9196 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 9197 LVP.executePlan(MainILV, DT); 9198 ++LoopsVectorized; 9199 9200 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9201 formLCSSARecursively(*L, *DT, LI, SE); 9202 9203 // Second pass vectorizes the epilogue and adjusts the control flow 9204 // edges from the first pass. 
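      // Rough sketch of the resulting structure, for hypothetical factors
      // EPI = {MainLoopVF=8, EpilogueVF=4}:
      //   main vector loop (VF=8) -> epilogue vector loop (VF=4) -> scalar
      //   remainder
      // The first pass above emitted the VF=8 loop plus a scalar epilogue;
      // the second pass below re-vectorizes that epilogue at VF=4.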
      LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
      EPI.MainLoopVF = EPI.EpilogueVF;
      EPI.MainLoopUF = EPI.EpilogueUF;
      EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                               ORE, EPI, &LVL, &CM, BFI, PSI);
      LVP.executePlan(EpilogILV, DT);
      ++LoopsEpilogueVectorized;

      if (!MainILV.areSafetyChecksAdded())
        DisableRuntimeUnroll = true;
    } else {
      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                             &LVL, &CM, BFI, PSI);
      LVP.executePlan(LB, DT);
      ++LoopsVectorized;

      // Add metadata to disable runtime unrolling of the scalar loop when
      // there are no runtime checks about strides and memory. A scalar loop
      // that is rarely used is not worth unrolling.
      if (!LB.areSafetyChecksAdded())
        DisableRuntimeUnroll = true;
    }

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
9299 SmallVector<Loop *, 8> Worklist; 9300 9301 for (Loop *L : *LI) 9302 collectSupportedLoops(*L, LI, ORE, Worklist); 9303 9304 LoopsAnalyzed += Worklist.size(); 9305 9306 // Now walk the identified inner loops. 9307 while (!Worklist.empty()) { 9308 Loop *L = Worklist.pop_back_val(); 9309 9310 // For the inner loops we actually process, form LCSSA to simplify the 9311 // transform. 9312 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 9313 9314 Changed |= CFGChanged |= processLoop(L); 9315 } 9316 9317 // Process each loop nest in the function. 9318 return LoopVectorizeResult(Changed, CFGChanged); 9319 } 9320 9321 PreservedAnalyses LoopVectorizePass::run(Function &F, 9322 FunctionAnalysisManager &AM) { 9323 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 9324 auto &LI = AM.getResult<LoopAnalysis>(F); 9325 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 9326 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 9327 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 9328 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 9329 auto &AA = AM.getResult<AAManager>(F); 9330 auto &AC = AM.getResult<AssumptionAnalysis>(F); 9331 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 9332 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 9333 MemorySSA *MSSA = EnableMSSALoopDependency 9334 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 9335 : nullptr; 9336 9337 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 9338 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 9339 [&](Loop &L) -> const LoopAccessInfo & { 9340 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 9341 TLI, TTI, nullptr, MSSA}; 9342 return LAM.getResult<LoopAccessAnalysis>(L, AR); 9343 }; 9344 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 9345 ProfileSummaryInfo *PSI = 9346 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 9347 LoopVectorizeResult Result = 9348 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 9349 if (!Result.MadeAnyChange) 9350 return PreservedAnalyses::all(); 9351 PreservedAnalyses PA; 9352 9353 // We currently do not preserve loopinfo/dominator analyses with outer loop 9354 // vectorization. Until this is addressed, mark these analyses as preserved 9355 // only for non-VPlan-native path. 9356 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 9357 if (!EnableVPlanNativePath) { 9358 PA.preserve<LoopAnalysis>(); 9359 PA.preserve<DominatorTreeAnalysis>(); 9360 } 9361 PA.preserve<BasicAA>(); 9362 PA.preserve<GlobalsAA>(); 9363 if (!Result.MadeCFGChange) 9364 PA.preserveSet<CFGAnalyses>(); 9365 return PA; 9366 } 9367