//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
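//
// As a rough illustration of the widening described above (a hypothetical
// example, not tied to any particular target): with a vectorization factor
// (VF) of 4, a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each iteration of the vector loop loads
// b[i..i+3] and c[i..i+3], performs a single 4-wide vector add, stores
// a[i..i+3], and advances the induction variable by 4. The remaining n % 4
// iterations are handled by the scalar epilogue loop (or by tail folding,
// depending on the options defined below).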
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SetVector.h" 73 #include "llvm/ADT/SmallPtrSet.h" 74 #include "llvm/ADT/SmallVector.h" 75 #include "llvm/ADT/Statistic.h" 76 #include "llvm/ADT/StringRef.h" 77 #include "llvm/ADT/Twine.h" 78 #include "llvm/ADT/iterator_range.h" 79 #include "llvm/Analysis/AssumptionCache.h" 80 #include "llvm/Analysis/BasicAliasAnalysis.h" 81 #include "llvm/Analysis/BlockFrequencyInfo.h" 82 #include "llvm/Analysis/CFG.h" 83 #include "llvm/Analysis/CodeMetrics.h" 84 #include "llvm/Analysis/DemandedBits.h" 85 #include "llvm/Analysis/GlobalsModRef.h" 86 #include "llvm/Analysis/LoopAccessAnalysis.h" 87 #include "llvm/Analysis/LoopAnalysisManager.h" 88 #include "llvm/Analysis/LoopInfo.h" 89 #include "llvm/Analysis/LoopIterator.h" 90 #include "llvm/Analysis/MemorySSA.h" 91 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 92 #include "llvm/Analysis/ProfileSummaryInfo.h" 93 #include "llvm/Analysis/ScalarEvolution.h" 94 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 95 #include "llvm/Analysis/TargetLibraryInfo.h" 96 #include "llvm/Analysis/TargetTransformInfo.h" 97 #include "llvm/Analysis/VectorUtils.h" 98 #include "llvm/IR/Attributes.h" 99 #include "llvm/IR/BasicBlock.h" 100 #include "llvm/IR/CFG.h" 101 #include "llvm/IR/Constant.h" 102 #include "llvm/IR/Constants.h" 103 #include "llvm/IR/DataLayout.h" 104 #include "llvm/IR/DebugInfoMetadata.h" 105 #include "llvm/IR/DebugLoc.h" 106 #include "llvm/IR/DerivedTypes.h" 107 #include "llvm/IR/DiagnosticInfo.h" 108 #include "llvm/IR/Dominators.h" 109 #include "llvm/IR/Function.h" 110 #include "llvm/IR/IRBuilder.h" 111 #include "llvm/IR/InstrTypes.h" 112 #include "llvm/IR/Instruction.h" 113 #include "llvm/IR/Instructions.h" 114 #include "llvm/IR/IntrinsicInst.h" 115 #include "llvm/IR/Intrinsics.h" 116 #include "llvm/IR/LLVMContext.h" 117 #include "llvm/IR/Metadata.h" 118 #include "llvm/IR/Module.h" 119 #include "llvm/IR/Operator.h" 120 #include "llvm/IR/Type.h" 121 #include "llvm/IR/Use.h" 122 #include "llvm/IR/User.h" 123 #include "llvm/IR/Value.h" 124 #include "llvm/IR/ValueHandle.h" 125 #include "llvm/IR/Verifier.h" 126 #include "llvm/InitializePasses.h" 127 #include "llvm/Pass.h" 128 #include "llvm/Support/Casting.h" 129 #include "llvm/Support/CommandLine.h" 130 #include "llvm/Support/Compiler.h" 131 #include "llvm/Support/Debug.h" 132 #include "llvm/Support/ErrorHandling.h" 133 #include "llvm/Support/MathExtras.h" 134 #include "llvm/Support/raw_ostream.h" 135 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 136 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 137 #include "llvm/Transforms/Utils/LoopSimplify.h" 138 #include "llvm/Transforms/Utils/LoopUtils.h" 139 #include "llvm/Transforms/Utils/LoopVersioning.h" 140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 141 #include "llvm/Transforms/Utils/SizeOpts.h" 142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 143 
#include <algorithm> 144 #include <cassert> 145 #include <cstdint> 146 #include <cstdlib> 147 #include <functional> 148 #include <iterator> 149 #include <limits> 150 #include <memory> 151 #include <string> 152 #include <tuple> 153 #include <utility> 154 155 using namespace llvm; 156 157 #define LV_NAME "loop-vectorize" 158 #define DEBUG_TYPE LV_NAME 159 160 #ifndef NDEBUG 161 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 162 #endif 163 164 /// @{ 165 /// Metadata attribute names 166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 167 const char LLVMLoopVectorizeFollowupVectorized[] = 168 "llvm.loop.vectorize.followup_vectorized"; 169 const char LLVMLoopVectorizeFollowupEpilogue[] = 170 "llvm.loop.vectorize.followup_epilogue"; 171 /// @} 172 173 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 176 177 static cl::opt<bool> EnableEpilogueVectorization( 178 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 179 cl::desc("Enable vectorization of epilogue loops.")); 180 181 static cl::opt<unsigned> EpilogueVectorizationForceVF( 182 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 183 cl::desc("When epilogue vectorization is enabled, and a value greater than " 184 "1 is specified, forces the given VF for all applicable epilogue " 185 "loops.")); 186 187 static cl::opt<unsigned> EpilogueVectorizationMinVF( 188 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 189 cl::desc("Only loops with vectorization factor equal to or larger than " 190 "the specified value are considered for epilogue vectorization.")); 191 192 /// Loops with a known constant trip count below this number are vectorized only 193 /// if no scalar iteration overheads are incurred. 194 static cl::opt<unsigned> TinyTripCountVectorThreshold( 195 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 196 cl::desc("Loops with a constant trip count that is smaller than this " 197 "value are vectorized only if no scalar iteration overheads " 198 "are incurred.")); 199 200 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 201 // that predication is preferred, and this lists all options. I.e., the 202 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 203 // and predicate the instructions accordingly. 
// If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
343 static cl::opt<bool> VPlanBuildStressTest( 344 "vplan-build-stress-test", cl::init(false), cl::Hidden, 345 cl::desc( 346 "Build VPlan for every supported loop nest in the function and bail " 347 "out right after the build (stress test the VPlan H-CFG construction " 348 "in the VPlan-native vectorization path).")); 349 350 cl::opt<bool> llvm::EnableLoopInterleaving( 351 "interleave-loops", cl::init(true), cl::Hidden, 352 cl::desc("Enable loop interleaving in Loop vectorization passes")); 353 cl::opt<bool> llvm::EnableLoopVectorization( 354 "vectorize-loops", cl::init(true), cl::Hidden, 355 cl::desc("Run the Loop vectorization passes")); 356 357 /// A helper function that returns the type of loaded or stored value. 358 static Type *getMemInstValueType(Value *I) { 359 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 360 "Expected Load or Store instruction"); 361 if (auto *LI = dyn_cast<LoadInst>(I)) 362 return LI->getType(); 363 return cast<StoreInst>(I)->getValueOperand()->getType(); 364 } 365 366 /// A helper function that returns true if the given type is irregular. The 367 /// type is irregular if its allocated size doesn't equal the store size of an 368 /// element of the corresponding vector type at the given vectorization factor. 369 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) { 370 // Determine if an array of VF elements of type Ty is "bitcast compatible" 371 // with a <VF x Ty> vector. 372 if (VF.isVector()) { 373 auto *VectorTy = VectorType::get(Ty, VF); 374 return TypeSize::get(VF.getKnownMinValue() * 375 DL.getTypeAllocSize(Ty).getFixedValue(), 376 VF.isScalable()) != DL.getTypeStoreSize(VectorTy); 377 } 378 379 // If the vectorization factor is one, we just check if an array of type Ty 380 // requires padding between elements. 381 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 382 } 383 384 /// A helper function that returns the reciprocal of the block probability of 385 /// predicated blocks. If we return X, we are assuming the predicated block 386 /// will execute once for every X iterations of the loop header. 387 /// 388 /// TODO: We should use actual block probability here, if available. Currently, 389 /// we always assume predicated blocks have a 50% chance of executing. 390 static unsigned getReciprocalPredBlockProb() { return 2; } 391 392 /// A helper function that adds a 'fast' flag to floating-point operations. 393 static Value *addFastMathFlag(Value *V) { 394 if (isa<FPMathOperator>(V)) 395 cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast()); 396 return V; 397 } 398 399 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) { 400 if (isa<FPMathOperator>(V)) 401 cast<Instruction>(V)->setFastMathFlags(FMF); 402 return V; 403 } 404 405 /// A helper function that returns an integer or floating-point constant with 406 /// value C. 407 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 408 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 409 : ConstantFP::get(Ty, C); 410 } 411 412 /// Returns "best known" trip count for the specified loop \p L as defined by 413 /// the following procedure: 414 /// 1) Returns exact trip count if it is known. 415 /// 2) Returns expected trip count according to profile data if any. 416 /// 3) Returns upper bound estimate if it is known. 417 /// 4) Returns None if all of the above failed. 418 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { 419 // Check if exact trip count is known. 
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
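  /// (Conceptually, and only as a non-authoritative sketch: a scalar
  ///   x = c ? a : b;
  /// becomes, across VF lanes, a vector select whose condition is either the
  /// widened vector of 'c' or, when \p InvariantCond is true, the single
  /// loop-invariant scalar condition applied to all lanes.)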
492 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, 493 bool InvariantCond, VPTransformState &State); 494 495 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 496 void fixVectorizedLoop(); 497 498 // Return true if any runtime check is added. 499 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 500 501 /// A type for vectorized values in the new loop. Each value from the 502 /// original loop, when vectorized, is represented by UF vector values in the 503 /// new unrolled loop, where UF is the unroll factor. 504 using VectorParts = SmallVector<Value *, 2>; 505 506 /// Vectorize a single GetElementPtrInst based on information gathered and 507 /// decisions taken during planning. 508 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, 509 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, 510 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); 511 512 /// Vectorize a single PHINode in a block. This method handles the induction 513 /// variable canonicalization. It supports both VF = 1 for unrolled loops and 514 /// arbitrary length vectors. 515 void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF); 516 517 /// A helper function to scalarize a single Instruction in the innermost loop. 518 /// Generates a sequence of scalar instances for each lane between \p MinLane 519 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 520 /// inclusive. Uses the VPValue operands from \p Operands instead of \p 521 /// Instr's operands. 522 void scalarizeInstruction(Instruction *Instr, VPUser &Operands, 523 const VPIteration &Instance, bool IfPredicateInstr, 524 VPTransformState &State); 525 526 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 527 /// is provided, the integer induction variable will first be truncated to 528 /// the corresponding type. 529 void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); 530 531 /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a 532 /// vector or scalar value on-demand if one is not yet available. When 533 /// vectorizing a loop, we visit the definition of an instruction before its 534 /// uses. When visiting the definition, we either vectorize or scalarize the 535 /// instruction, creating an entry for it in the corresponding map. (In some 536 /// cases, such as induction variables, we will create both vector and scalar 537 /// entries.) Then, as we encounter uses of the definition, we derive values 538 /// for each scalar or vector use unless such a value is already available. 539 /// For example, if we scalarize a definition and one of its uses is vector, 540 /// we build the required vector on-demand with an insertelement sequence 541 /// when visiting the use. Otherwise, if the use is scalar, we can use the 542 /// existing scalar definition. 543 /// 544 /// Return a value in the new loop corresponding to \p V from the original 545 /// loop at unroll index \p Part. If the value has already been vectorized, 546 /// the corresponding vector entry in VectorLoopValueMap is returned. If, 547 /// however, the value has a scalar entry in VectorLoopValueMap, we construct 548 /// a new vector value on-demand by inserting the scalar values into a vector 549 /// with an insertelement sequence. If the value has been neither vectorized 550 /// nor scalarized, it must be loop invariant, so we simply broadcast the 551 /// value into a vector. 
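  /// (Illustrative sketch, not normative: if a definition %d was scalarized
  /// into lanes %d.0 ... %d.3 for VF = 4 and a later user needs it as a
  /// vector, the missing vector value is assembled with an insertelement
  /// sequence over those four lanes; a loop-invariant value is instead
  /// broadcast into a vector.)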
552 Value *getOrCreateVectorValue(Value *V, unsigned Part); 553 554 void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) { 555 VectorLoopValueMap.setVectorValue(Scalar, Part, Vector); 556 } 557 558 /// Return a value in the new loop corresponding to \p V from the original 559 /// loop at unroll and vector indices \p Instance. If the value has been 560 /// vectorized but not scalarized, the necessary extractelement instruction 561 /// will be generated. 562 Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance); 563 564 /// Construct the vector value of a scalarized value \p V one lane at a time. 565 void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); 566 567 /// Try to vectorize interleaved access group \p Group with the base address 568 /// given in \p Addr, optionally masking the vector operations if \p 569 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 570 /// values in the vectorized loop. 571 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 572 ArrayRef<VPValue *> VPDefs, 573 VPTransformState &State, VPValue *Addr, 574 ArrayRef<VPValue *> StoredValues, 575 VPValue *BlockInMask = nullptr); 576 577 /// Vectorize Load and Store instructions with the base address given in \p 578 /// Addr, optionally masking the vector operations if \p BlockInMask is 579 /// non-null. Use \p State to translate given VPValues to IR values in the 580 /// vectorized loop. 581 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, 582 VPValue *Def, VPValue *Addr, 583 VPValue *StoredValue, VPValue *BlockInMask); 584 585 /// Set the debug location in the builder using the debug location in 586 /// the instruction. 587 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); 588 589 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 590 void fixNonInductionPHIs(void); 591 592 protected: 593 friend class LoopVectorizationPlanner; 594 595 /// A small list of PHINodes. 596 using PhiVector = SmallVector<PHINode *, 4>; 597 598 /// A type for scalarized values in the new loop. Each value from the 599 /// original loop, when scalarized, is represented by UF x VF scalar values 600 /// in the new unrolled loop, where UF is the unroll factor and VF is the 601 /// vectorization factor. 602 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 603 604 /// Set up the values of the IVs correctly when exiting the vector loop. 605 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 606 Value *CountRoundDown, Value *EndValue, 607 BasicBlock *MiddleBlock); 608 609 /// Create a new induction variable inside L. 610 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 611 Value *Step, Instruction *DL); 612 613 /// Handle all cross-iteration phis in the header. 614 void fixCrossIterationPHIs(); 615 616 /// Fix a first-order recurrence. This is the second phase of vectorizing 617 /// this phi node. 618 void fixFirstOrderRecurrence(PHINode *Phi); 619 620 /// Fix a reduction cross-iteration phi. This is the second phase of 621 /// vectorizing this phi node. 622 void fixReduction(PHINode *Phi); 623 624 /// Clear NSW/NUW flags from reduction instructions if necessary. 625 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); 626 627 /// The Loop exit block may have single value PHI nodes with some 628 /// incoming value. 
While vectorizing we only handled real values 629 /// that were defined inside the loop and we should have one value for 630 /// each predecessor of its parent basic block. See PR14725. 631 void fixLCSSAPHIs(); 632 633 /// Iteratively sink the scalarized operands of a predicated instruction into 634 /// the block that was created for it. 635 void sinkScalarOperands(Instruction *PredInst); 636 637 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 638 /// represented as. 639 void truncateToMinimalBitwidths(); 640 641 /// Create a broadcast instruction. This method generates a broadcast 642 /// instruction (shuffle) for loop invariant values and for the induction 643 /// value. If this is the induction variable then we extend it to N, N+1, ... 644 /// this is needed because each iteration in the loop corresponds to a SIMD 645 /// element. 646 virtual Value *getBroadcastInstrs(Value *V); 647 648 /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) 649 /// to each vector element of Val. The sequence starts at StartIndex. 650 /// \p Opcode is relevant for FP induction variable. 651 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, 652 Instruction::BinaryOps Opcode = 653 Instruction::BinaryOpsEnd); 654 655 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 656 /// variable on which to base the steps, \p Step is the size of the step, and 657 /// \p EntryVal is the value from the original loop that maps to the steps. 658 /// Note that \p EntryVal doesn't have to be an induction variable - it 659 /// can also be a truncate instruction. 660 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 661 const InductionDescriptor &ID); 662 663 /// Create a vector induction phi node based on an existing scalar one. \p 664 /// EntryVal is the value from the original loop that maps to the vector phi 665 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a 666 /// truncate instruction, instead of widening the original IV, we widen a 667 /// version of the IV truncated to \p EntryVal's type. 668 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, 669 Value *Step, Instruction *EntryVal); 670 671 /// Returns true if an instruction \p I should be scalarized instead of 672 /// vectorized for the chosen vectorization factor. 673 bool shouldScalarizeInstruction(Instruction *I) const; 674 675 /// Returns true if we should generate a scalar version of \p IV. 676 bool needsScalarInduction(Instruction *IV) const; 677 678 /// If there is a cast involved in the induction variable \p ID, which should 679 /// be ignored in the vectorized loop body, this function records the 680 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the 681 /// cast. We had already proved that the casted Phi is equal to the uncasted 682 /// Phi in the vectorized loop (under a runtime guard), and therefore 683 /// there is no need to vectorize the cast - the same value can be used in the 684 /// vector loop for both the Phi and the cast. 685 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, 686 /// Otherwise, \p VectorLoopValue is a widened/vectorized value. 
687 /// 688 /// \p EntryVal is the value from the original loop that maps to the vector 689 /// phi node and is used to distinguish what is the IV currently being 690 /// processed - original one (if \p EntryVal is a phi corresponding to the 691 /// original IV) or the "newly-created" one based on the proof mentioned above 692 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the 693 /// latter case \p EntryVal is a TruncInst and we must not record anything for 694 /// that IV, but it's error-prone to expect callers of this routine to care 695 /// about that, hence this explicit parameter. 696 void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID, 697 const Instruction *EntryVal, 698 Value *VectorLoopValue, 699 unsigned Part, 700 unsigned Lane = UINT_MAX); 701 702 /// Generate a shuffle sequence that will reverse the vector Vec. 703 virtual Value *reverseVector(Value *Vec); 704 705 /// Returns (and creates if needed) the original loop trip count. 706 Value *getOrCreateTripCount(Loop *NewLoop); 707 708 /// Returns (and creates if needed) the trip count of the widened loop. 709 Value *getOrCreateVectorTripCount(Loop *NewLoop); 710 711 /// Returns a bitcasted value to the requested vector type. 712 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 713 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 714 const DataLayout &DL); 715 716 /// Emit a bypass check to see if the vector trip count is zero, including if 717 /// it overflows. 718 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); 719 720 /// Emit a bypass check to see if all of the SCEV assumptions we've 721 /// had to make are correct. 722 void emitSCEVChecks(Loop *L, BasicBlock *Bypass); 723 724 /// Emit bypass checks to check any memory assumptions we may have made. 725 void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); 726 727 /// Compute the transformed value of Index at offset StartValue using step 728 /// StepValue. 729 /// For integer induction, returns StartValue + Index * StepValue. 730 /// For pointer induction, returns StartValue[Index * StepValue]. 731 /// FIXME: The newly created binary instructions should contain nsw/nuw 732 /// flags, which can be found from the original scalar operations. 733 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, 734 const DataLayout &DL, 735 const InductionDescriptor &ID) const; 736 737 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 738 /// vector loop preheader, middle block and scalar preheader. Also 739 /// allocate a loop object for the new vector loop and return it. 740 Loop *createVectorLoopSkeleton(StringRef Prefix); 741 742 /// Create new phi nodes for the induction variables to resume iteration count 743 /// in the scalar epilogue, from where the vectorized loop left off (given by 744 /// \p VectorTripCount). 745 /// In cases where the loop skeleton is more complicated (eg. epilogue 746 /// vectorization) and the resume values can come from an additional bypass 747 /// block, the \p AdditionalBypass pair provides information about the bypass 748 /// block and the end value on the edge from bypass to this loop. 
749 void createInductionResumeValues( 750 Loop *L, Value *VectorTripCount, 751 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 752 753 /// Complete the loop skeleton by adding debug MDs, creating appropriate 754 /// conditional branches in the middle block, preparing the builder and 755 /// running the verifier. Take in the vector loop \p L as argument, and return 756 /// the preheader of the completed vector loop. 757 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); 758 759 /// Add additional metadata to \p To that was not present on \p Orig. 760 /// 761 /// Currently this is used to add the noalias annotations based on the 762 /// inserted memchecks. Use this for instructions that are *cloned* into the 763 /// vector loop. 764 void addNewMetadata(Instruction *To, const Instruction *Orig); 765 766 /// Add metadata from one instruction to another. 767 /// 768 /// This includes both the original MDs from \p From and additional ones (\see 769 /// addNewMetadata). Use this for *newly created* instructions in the vector 770 /// loop. 771 void addMetadata(Instruction *To, Instruction *From); 772 773 /// Similar to the previous function but it adds the metadata to a 774 /// vector of instructions. 775 void addMetadata(ArrayRef<Value *> To, Instruction *From); 776 777 /// Allow subclasses to override and print debug traces before/after vplan 778 /// execution, when trace information is requested. 779 virtual void printDebugTracesAtStart(){}; 780 virtual void printDebugTracesAtEnd(){}; 781 782 /// The original loop. 783 Loop *OrigLoop; 784 785 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 786 /// dynamic knowledge to simplify SCEV expressions and converts them to a 787 /// more usable form. 788 PredicatedScalarEvolution &PSE; 789 790 /// Loop Info. 791 LoopInfo *LI; 792 793 /// Dominator Tree. 794 DominatorTree *DT; 795 796 /// Alias Analysis. 797 AAResults *AA; 798 799 /// Target Library Info. 800 const TargetLibraryInfo *TLI; 801 802 /// Target Transform Info. 803 const TargetTransformInfo *TTI; 804 805 /// Assumption Cache. 806 AssumptionCache *AC; 807 808 /// Interface to emit optimization remarks. 809 OptimizationRemarkEmitter *ORE; 810 811 /// LoopVersioning. It's only set up (non-null) if memchecks were 812 /// used. 813 /// 814 /// This is currently only used to add no-alias metadata based on the 815 /// memchecks. The actually versioning is performed manually. 816 std::unique_ptr<LoopVersioning> LVer; 817 818 /// The vectorization SIMD factor to use. Each vector will have this many 819 /// vector elements. 820 ElementCount VF; 821 822 /// The vectorization unroll factor to use. Each scalar is vectorized to this 823 /// many different vector instructions. 824 unsigned UF; 825 826 /// The builder that we use 827 IRBuilder<> Builder; 828 829 // --- Vectorization state --- 830 831 /// The vector-loop preheader. 832 BasicBlock *LoopVectorPreHeader; 833 834 /// The scalar-loop preheader. 835 BasicBlock *LoopScalarPreHeader; 836 837 /// Middle Block between the vector and the scalar. 838 BasicBlock *LoopMiddleBlock; 839 840 /// The (unique) ExitBlock of the scalar loop. Note that 841 /// there can be multiple exiting edges reaching this block. 842 BasicBlock *LoopExitBlock; 843 844 /// The vector loop body. 845 BasicBlock *LoopVectorBody; 846 847 /// The scalar loop body. 848 BasicBlock *LoopScalarBody; 849 850 /// A list of all bypass blocks. The first block is the entry of the loop. 
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
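/// (For example, and purely as an illustration of intended use: constructing
/// EpilogueLoopVectorizationInfo(16, 2, 8, 1) describes a main vector loop
/// with VF = 16 and UF = 2 followed by an epilogue vector loop with VF = 8
/// and UF = 1; the constructor asserts that the epilogue UF is 1.)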
925 struct EpilogueLoopVectorizationInfo { 926 ElementCount MainLoopVF = ElementCount::getFixed(0); 927 unsigned MainLoopUF = 0; 928 ElementCount EpilogueVF = ElementCount::getFixed(0); 929 unsigned EpilogueUF = 0; 930 BasicBlock *MainLoopIterationCountCheck = nullptr; 931 BasicBlock *EpilogueIterationCountCheck = nullptr; 932 BasicBlock *SCEVSafetyCheck = nullptr; 933 BasicBlock *MemSafetyCheck = nullptr; 934 Value *TripCount = nullptr; 935 Value *VectorTripCount = nullptr; 936 937 EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, 938 unsigned EUF) 939 : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), 940 EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { 941 assert(EUF == 1 && 942 "A high UF for the epilogue loop is likely not beneficial."); 943 } 944 }; 945 946 /// An extension of the inner loop vectorizer that creates a skeleton for a 947 /// vectorized loop that has its epilogue (residual) also vectorized. 948 /// The idea is to run the vplan on a given loop twice, firstly to setup the 949 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 950 /// from the first step and vectorize the epilogue. This is achieved by 951 /// deriving two concrete strategy classes from this base class and invoking 952 /// them in succession from the loop vectorizer planner. 953 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 954 public: 955 InnerLoopAndEpilogueVectorizer( 956 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 957 DominatorTree *DT, const TargetLibraryInfo *TLI, 958 const TargetTransformInfo *TTI, AssumptionCache *AC, 959 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 960 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 961 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) 962 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 963 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI), 964 EPI(EPI) {} 965 966 // Override this function to handle the more complex control flow around the 967 // three loops. 968 BasicBlock *createVectorizedLoopSkeleton() final override { 969 return createEpilogueVectorizedLoopSkeleton(); 970 } 971 972 /// The interface for creating a vectorized skeleton using one of two 973 /// different strategies, each corresponding to one execution of the vplan 974 /// as described above. 975 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; 976 977 /// Holds and updates state information required to vectorize the main loop 978 /// and its epilogue in two separate passes. This setup helps us avoid 979 /// regenerating and recomputing runtime safety checks. It also helps us to 980 /// shorten the iteration-count-check path length for the cases where the 981 /// iteration count of the loop is so small that the main vector loop is 982 /// completely skipped. 983 EpilogueLoopVectorizationInfo &EPI; 984 }; 985 986 /// A specialized derived class of inner loop vectorizer that performs 987 /// vectorization of *main* loops in the process of vectorizing loops and their 988 /// epilogues. 
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                                 LoopInfo *LI, DominatorTree *DT,
                                 const TargetLibraryInfo *TLI,
                                 const TargetTransformInfo *TTI,
                                 AssumptionCache *AC,
                                 OptimizationRemarkEmitter *ORE,
                                 EpilogueLoopVectorizationInfo &EPI,
                                 LoopVectorizationLegality *LVL,
                                 llvm::LoopVectorizationCostModel *CM,
                                 BlockFrequencyInfo *BFI,
                                 ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
1048 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 1049 if (!I) 1050 return I; 1051 1052 DebugLoc Empty; 1053 if (I->getDebugLoc() != Empty) 1054 return I; 1055 1056 for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { 1057 if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) 1058 if (OpInst->getDebugLoc() != Empty) 1059 return OpInst; 1060 } 1061 1062 return I; 1063 } 1064 1065 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { 1066 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { 1067 const DILocation *DIL = Inst->getDebugLoc(); 1068 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && 1069 !isa<DbgInfoIntrinsic>(Inst)) { 1070 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1071 auto NewDIL = 1072 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); 1073 if (NewDIL) 1074 B.SetCurrentDebugLocation(NewDIL.getValue()); 1075 else 1076 LLVM_DEBUG(dbgs() 1077 << "Failed to create new discriminator: " 1078 << DIL->getFilename() << " Line: " << DIL->getLine()); 1079 } 1080 else 1081 B.SetCurrentDebugLocation(DIL); 1082 } else 1083 B.SetCurrentDebugLocation(DebugLoc()); 1084 } 1085 1086 /// Write a record \p DebugMsg about vectorization failure to the debug 1087 /// output stream. If \p I is passed, it is an instruction that prevents 1088 /// vectorization. 1089 #ifndef NDEBUG 1090 static void debugVectorizationFailure(const StringRef DebugMsg, 1091 Instruction *I) { 1092 dbgs() << "LV: Not vectorizing: " << DebugMsg; 1093 if (I != nullptr) 1094 dbgs() << " " << *I; 1095 else 1096 dbgs() << '.'; 1097 dbgs() << '\n'; 1098 } 1099 #endif 1100 1101 /// Create an analysis remark that explains why vectorization failed 1102 /// 1103 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 1104 /// RemarkName is the identifier for the remark. If \p I is passed it is an 1105 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 1106 /// the location of the remark. \return the remark object that can be 1107 /// streamed to. 1108 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 1109 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 1110 Value *CodeRegion = TheLoop->getHeader(); 1111 DebugLoc DL = TheLoop->getStartLoc(); 1112 1113 if (I) { 1114 CodeRegion = I->getParent(); 1115 // If there is no debug location attached to the instruction, revert back to 1116 // using the loop's. 1117 if (I->getDebugLoc()) 1118 DL = I->getDebugLoc(); 1119 } 1120 1121 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); 1122 R << "loop not vectorized: "; 1123 return R; 1124 } 1125 1126 /// Return a value for Step multiplied by VF. 1127 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { 1128 assert(isa<ConstantInt>(Step) && "Expected an integer step"); 1129 Constant *StepVal = ConstantInt::get( 1130 Step->getType(), 1131 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); 1132 return VF.isScalable() ? 
B.CreateVScale(StepVal) : StepVal; 1133 } 1134 1135 namespace llvm { 1136 1137 void reportVectorizationFailure(const StringRef DebugMsg, 1138 const StringRef OREMsg, const StringRef ORETag, 1139 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { 1140 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); 1141 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1142 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), 1143 ORETag, TheLoop, I) << OREMsg); 1144 } 1145 1146 } // end namespace llvm 1147 1148 #ifndef NDEBUG 1149 /// \return string containing a file name and a line # for the given loop. 1150 static std::string getDebugLocString(const Loop *L) { 1151 std::string Result; 1152 if (L) { 1153 raw_string_ostream OS(Result); 1154 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1155 LoopDbgLoc.print(OS); 1156 else 1157 // Just print the module name. 1158 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1159 OS.flush(); 1160 } 1161 return Result; 1162 } 1163 #endif 1164 1165 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 1166 const Instruction *Orig) { 1167 // If the loop was versioned with memchecks, add the corresponding no-alias 1168 // metadata. 1169 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 1170 LVer->annotateInstWithNoAlias(To, Orig); 1171 } 1172 1173 void InnerLoopVectorizer::addMetadata(Instruction *To, 1174 Instruction *From) { 1175 propagateMetadata(To, From); 1176 addNewMetadata(To, From); 1177 } 1178 1179 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1180 Instruction *From) { 1181 for (Value *V : To) { 1182 if (Instruction *I = dyn_cast<Instruction>(V)) 1183 addMetadata(I, From); 1184 } 1185 } 1186 1187 namespace llvm { 1188 1189 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1190 // lowered. 1191 enum ScalarEpilogueLowering { 1192 1193 // The default: allowing scalar epilogues. 1194 CM_ScalarEpilogueAllowed, 1195 1196 // Vectorization with OptForSize: don't allow epilogues. 1197 CM_ScalarEpilogueNotAllowedOptSize, 1198 1199 // A special case of vectorisation with OptForSize: loops with a very small 1200 // trip count are considered for vectorization under OptForSize, thereby 1201 // making sure the cost of their loop body is dominant, free of runtime 1202 // guards and scalar iteration overheads. 1203 CM_ScalarEpilogueNotAllowedLowTripLoop, 1204 1205 // Loop hint predicate indicating an epilogue is undesired. 1206 CM_ScalarEpilogueNotNeededUsePredicate, 1207 1208 // Directive indicating we must either tail fold or not vectorize 1209 CM_ScalarEpilogueNotAllowedUsePredicate 1210 }; 1211 1212 /// LoopVectorizationCostModel - estimates the expected speedups due to 1213 /// vectorization. 1214 /// In many cases vectorization is not profitable. This can happen because of 1215 /// a number of reasons. In this class we mainly attempt to predict the 1216 /// expected speedup/slowdowns due to the supported instruction set. We use the 1217 /// TargetTransformInfo to query the different backends for the cost of 1218 /// different operations. 
1219 class LoopVectorizationCostModel { 1220 public: 1221 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1222 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1223 LoopVectorizationLegality *Legal, 1224 const TargetTransformInfo &TTI, 1225 const TargetLibraryInfo *TLI, DemandedBits *DB, 1226 AssumptionCache *AC, 1227 OptimizationRemarkEmitter *ORE, const Function *F, 1228 const LoopVectorizeHints *Hints, 1229 InterleavedAccessInfo &IAI) 1230 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1231 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1232 Hints(Hints), InterleaveInfo(IAI) {} 1233 1234 /// \return An upper bound for the vectorization factor, or None if 1235 /// vectorization and interleaving should be avoided up front. 1236 Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC); 1237 1238 /// \return True if runtime checks are required for vectorization, and false 1239 /// otherwise. 1240 bool runtimeChecksRequired(); 1241 1242 /// \return The most profitable vectorization factor and the cost of that VF. 1243 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO 1244 /// then this vectorization factor will be selected if vectorization is 1245 /// possible. 1246 VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); 1247 VectorizationFactor 1248 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1249 const LoopVectorizationPlanner &LVP); 1250 1251 /// Setup cost-based decisions for user vectorization factor. 1252 void selectUserVectorizationFactor(ElementCount UserVF) { 1253 collectUniformsAndScalars(UserVF); 1254 collectInstsToScalarize(UserVF); 1255 } 1256 1257 /// \return The size (in bits) of the smallest and widest types in the code 1258 /// that needs to be vectorized. We ignore values that remain scalar such as 1259 /// 64 bit loop indices. 1260 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1261 1262 /// \return The desired interleave count. 1263 /// If interleave count has been specified by metadata it will be returned. 1264 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1265 /// are the selected vectorization factor and the cost of the selected VF. 1266 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1267 1268 /// Memory access instruction may be vectorized in more than one way. 1269 /// Form of instruction after vectorization depends on cost. 1270 /// This function takes cost-based decisions for Load/Store instructions 1271 /// and collects them in a map. This decisions map is used for building 1272 /// the lists of loop-uniform and loop-scalar instructions. 1273 /// The calculated cost is saved with widening decision in order to 1274 /// avoid redundant calculations. 1275 void setCostBasedWideningDecision(ElementCount VF); 1276 1277 /// A struct that represents some properties of the register usage 1278 /// of a loop. 1279 struct RegisterUsage { 1280 /// Holds the number of loop invariant values that are used in the loop. 1281 /// The key is ClassID of target-provided register class. 1282 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1283 /// Holds the maximum number of concurrent live intervals in the loop. 1284 /// The key is ClassID of target-provided register class. 1285 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1286 }; 1287 1288 /// \return Returns information about the register usages of the loop for the 1289 /// given vectorization factors. 
1290 SmallVector<RegisterUsage, 8> 1291 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1292 1293 /// Collect values we want to ignore in the cost model. 1294 void collectValuesToIgnore(); 1295 1296 /// Split reductions into those that happen in the loop, and those that happen 1297 /// outside. In loop reductions are collected into InLoopReductionChains. 1298 void collectInLoopReductions(); 1299 1300 /// \returns The smallest bitwidth each instruction can be represented with. 1301 /// The vector equivalents of these instructions should be truncated to this 1302 /// type. 1303 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1304 return MinBWs; 1305 } 1306 1307 /// \returns True if it is more profitable to scalarize instruction \p I for 1308 /// vectorization factor \p VF. 1309 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1310 assert(VF.isVector() && 1311 "Profitable to scalarize relevant only for VF > 1."); 1312 1313 // Cost model is not run in the VPlan-native path - return conservative 1314 // result until this changes. 1315 if (EnableVPlanNativePath) 1316 return false; 1317 1318 auto Scalars = InstsToScalarize.find(VF); 1319 assert(Scalars != InstsToScalarize.end() && 1320 "VF not yet analyzed for scalarization profitability"); 1321 return Scalars->second.find(I) != Scalars->second.end(); 1322 } 1323 1324 /// Returns true if \p I is known to be uniform after vectorization. 1325 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1326 if (VF.isScalar()) 1327 return true; 1328 1329 // Cost model is not run in the VPlan-native path - return conservative 1330 // result until this changes. 1331 if (EnableVPlanNativePath) 1332 return false; 1333 1334 auto UniformsPerVF = Uniforms.find(VF); 1335 assert(UniformsPerVF != Uniforms.end() && 1336 "VF not yet analyzed for uniformity"); 1337 return UniformsPerVF->second.count(I); 1338 } 1339 1340 /// Returns true if \p I is known to be scalar after vectorization. 1341 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1342 if (VF.isScalar()) 1343 return true; 1344 1345 // Cost model is not run in the VPlan-native path - return conservative 1346 // result until this changes. 1347 if (EnableVPlanNativePath) 1348 return false; 1349 1350 auto ScalarsPerVF = Scalars.find(VF); 1351 assert(ScalarsPerVF != Scalars.end() && 1352 "Scalar values are not calculated for VF"); 1353 return ScalarsPerVF->second.count(I); 1354 } 1355 1356 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1357 /// for vectorization factor \p VF. 1358 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1359 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1360 !isProfitableToScalarize(I, VF) && 1361 !isScalarAfterVectorization(I, VF); 1362 } 1363 1364 /// Decision that was taken during cost calculation for memory instruction. 1365 enum InstWidening { 1366 CM_Unknown, 1367 CM_Widen, // For consecutive accesses with stride +1. 1368 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1369 CM_Interleave, 1370 CM_GatherScatter, 1371 CM_Scalarize 1372 }; 1373 1374 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1375 /// instruction \p I and vector width \p VF. 
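  /// For instance, a unit-stride load is typically recorded as CM_Widen (or
  /// CM_Widen_Reverse for a -1 stride) together with the cost of the wide
  /// load, while an indexed access such as A[B[i]] would usually end up as
  /// CM_GatherScatter or CM_Scalarize, whichever the target prices lower
  /// (illustrative; the actual choice is made in setCostBasedWideningDecision).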
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W, unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return true if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
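  /// For example, a conditional store that must be predicated may be cheaper
  /// as VF individual, branch-guarded scalar stores than as a masked vector
  /// store; in that case the store (and any single-use chain feeding it) is
  /// recorded here so later cost queries treat it as scalarized (illustrative;
  /// see computePredInstDiscount for the actual profitability test).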
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
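  /// For example, a load of A[i] in a loop advancing i by 1 has a consecutive
  /// (stride +1) access and can be turned into a single wide load per vector
  /// iteration, whereas a load of A[2*i] cannot be widened this way
  /// (illustrative).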
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue() const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized -
  /// i.e. either a vector version isn't available, or it is too expensive.
  unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
                             bool &NeedToScalarize);

  /// Invalidates decisions already taken by the cost model.
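  /// Note that after calling this, collectUniformsAndScalars() must run again
  /// before querying per-VF information, since several of the queries above
  /// assert that the per-VF maps have been populated.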
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, a power-of-2 larger
  /// than zero. One is returned if vectorization should best be avoided due
  /// to cost.
  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
                                    ElementCount UserVF);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(ElementCount VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
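  /// For example, if DemandedBits proves that only the low 8 bits of an i32
  /// value are ever used, an entry mapping it to 8 allows its vector form to
  /// be computed on <VF x i8> instead of <VF x i32> (illustrative).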
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
1745 void collectLoopUniforms(ElementCount VF); 1746 1747 /// Collect the instructions that are scalar after vectorization. An 1748 /// instruction is scalar if it is known to be uniform or will be scalarized 1749 /// during vectorization. Non-uniform scalarized instructions will be 1750 /// represented by VF values in the vectorized loop, each corresponding to an 1751 /// iteration of the original scalar loop. 1752 void collectLoopScalars(ElementCount VF); 1753 1754 /// Keeps cost model vectorization decision and cost for instructions. 1755 /// Right now it is used for memory instructions only. 1756 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1757 std::pair<InstWidening, unsigned>>; 1758 1759 DecisionList WideningDecisions; 1760 1761 /// Returns true if \p V is expected to be vectorized and it needs to be 1762 /// extracted. 1763 bool needsExtract(Value *V, ElementCount VF) const { 1764 Instruction *I = dyn_cast<Instruction>(V); 1765 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1766 TheLoop->isLoopInvariant(I)) 1767 return false; 1768 1769 // Assume we can vectorize V (and hence we need extraction) if the 1770 // scalars are not computed yet. This can happen, because it is called 1771 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1772 // the scalars are collected. That should be a safe assumption in most 1773 // cases, because we check if the operands have vectorizable types 1774 // beforehand in LoopVectorizationLegality. 1775 return Scalars.find(VF) == Scalars.end() || 1776 !isScalarAfterVectorization(I, VF); 1777 }; 1778 1779 /// Returns a range containing only operands needing to be extracted. 1780 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1781 ElementCount VF) { 1782 return SmallVector<Value *, 4>(make_filter_range( 1783 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1784 } 1785 1786 /// Determines if we have the infrastructure to vectorize loop \p L and its 1787 /// epilogue, assuming the main loop is vectorized by \p VF. 1788 bool isCandidateForEpilogueVectorization(const Loop &L, 1789 const ElementCount VF) const; 1790 1791 /// Returns true if epilogue vectorization is considered profitable, and 1792 /// false otherwise. 1793 /// \p VF is the vectorization factor chosen for the original loop. 1794 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1795 1796 public: 1797 /// The loop that we evaluate. 1798 Loop *TheLoop; 1799 1800 /// Predicated scalar evolution analysis. 1801 PredicatedScalarEvolution &PSE; 1802 1803 /// Loop Info analysis. 1804 LoopInfo *LI; 1805 1806 /// Vectorization legality. 1807 LoopVectorizationLegality *Legal; 1808 1809 /// Vector target information. 1810 const TargetTransformInfo &TTI; 1811 1812 /// Target Library Info. 1813 const TargetLibraryInfo *TLI; 1814 1815 /// Demanded bits analysis. 1816 DemandedBits *DB; 1817 1818 /// Assumption cache. 1819 AssumptionCache *AC; 1820 1821 /// Interface to emit optimization remarks. 1822 OptimizationRemarkEmitter *ORE; 1823 1824 const Function *TheFunction; 1825 1826 /// Loop Vectorize Hint. 1827 const LoopVectorizeHints *Hints; 1828 1829 /// The interleave access information contains groups of interleaved accesses 1830 /// with the same stride and close to each other. 1831 InterleavedAccessInfo &InterleaveInfo; 1832 1833 /// Values to ignore in the cost model. 1834 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1835 1836 /// Values to ignore in the cost model when VF > 1. 
1837 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1838 1839 /// Profitable vector factors. 1840 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1841 }; 1842 1843 } // end namespace llvm 1844 1845 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1846 // vectorization. The loop needs to be annotated with #pragma omp simd 1847 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1848 // vector length information is not provided, vectorization is not considered 1849 // explicit. Interleave hints are not allowed either. These limitations will be 1850 // relaxed in the future. 1851 // Please, note that we are currently forced to abuse the pragma 'clang 1852 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1853 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1854 // provides *explicit vectorization hints* (LV can bypass legal checks and 1855 // assume that vectorization is legal). However, both hints are implemented 1856 // using the same metadata (llvm.loop.vectorize, processed by 1857 // LoopVectorizeHints). This will be fixed in the future when the native IR 1858 // representation for pragma 'omp simd' is introduced. 1859 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1860 OptimizationRemarkEmitter *ORE) { 1861 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 1862 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1863 1864 // Only outer loops with an explicit vectorization hint are supported. 1865 // Unannotated outer loops are ignored. 1866 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1867 return false; 1868 1869 Function *Fn = OuterLp->getHeader()->getParent(); 1870 if (!Hints.allowVectorization(Fn, OuterLp, 1871 true /*VectorizeOnlyWhenForced*/)) { 1872 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1873 return false; 1874 } 1875 1876 if (Hints.getInterleave() > 1) { 1877 // TODO: Interleave support is future work. 1878 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1879 "outer loops.\n"); 1880 Hints.emitRemarkWithHints(); 1881 return false; 1882 } 1883 1884 return true; 1885 } 1886 1887 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1888 OptimizationRemarkEmitter *ORE, 1889 SmallVectorImpl<Loop *> &V) { 1890 // Collect inner loops and outer loops without irreducible control flow. For 1891 // now, only collect outer loops that have explicit vectorization hints. If we 1892 // are stress testing the VPlan H-CFG construction, we collect the outermost 1893 // loop of every loop nest. 1894 if (L.isInnermost() || VPlanBuildStressTest || 1895 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1896 LoopBlocksRPO RPOT(&L); 1897 RPOT.perform(LI); 1898 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1899 V.push_back(&L); 1900 // TODO: Collect inner loops inside marked outer loops in case 1901 // vectorization fails for the outer loop. Do not invoke 1902 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1903 // already known to be reducible. We can use an inherited attribute for 1904 // that. 1905 return; 1906 } 1907 } 1908 for (Loop *InnerL : L) 1909 collectSupportedLoops(*InnerL, LI, ORE, V); 1910 } 1911 1912 namespace { 1913 1914 /// The LoopVectorize Pass. 
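/// This is the legacy pass-manager wrapper: it only gathers the analyses the
/// vectorizer needs and forwards the real work to the LoopVectorizePass
/// instance (Impl) it owns.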
1915 struct LoopVectorize : public FunctionPass { 1916 /// Pass identification, replacement for typeid 1917 static char ID; 1918 1919 LoopVectorizePass Impl; 1920 1921 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1922 bool VectorizeOnlyWhenForced = false) 1923 : FunctionPass(ID), 1924 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1925 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1926 } 1927 1928 bool runOnFunction(Function &F) override { 1929 if (skipFunction(F)) 1930 return false; 1931 1932 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1933 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1934 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1935 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1936 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1937 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1938 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1939 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1940 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1941 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1942 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1943 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1944 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1945 1946 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1947 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1948 1949 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1950 GetLAA, *ORE, PSI).MadeAnyChange; 1951 } 1952 1953 void getAnalysisUsage(AnalysisUsage &AU) const override { 1954 AU.addRequired<AssumptionCacheTracker>(); 1955 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1956 AU.addRequired<DominatorTreeWrapperPass>(); 1957 AU.addRequired<LoopInfoWrapperPass>(); 1958 AU.addRequired<ScalarEvolutionWrapperPass>(); 1959 AU.addRequired<TargetTransformInfoWrapperPass>(); 1960 AU.addRequired<AAResultsWrapperPass>(); 1961 AU.addRequired<LoopAccessLegacyAnalysis>(); 1962 AU.addRequired<DemandedBitsWrapperPass>(); 1963 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1964 AU.addRequired<InjectTLIMappingsLegacy>(); 1965 1966 // We currently do not preserve loopinfo/dominator analyses with outer loop 1967 // vectorization. Until this is addressed, mark these analyses as preserved 1968 // only for non-VPlan-native path. 1969 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1970 if (!EnableVPlanNativePath) { 1971 AU.addPreserved<LoopInfoWrapperPass>(); 1972 AU.addPreserved<DominatorTreeWrapperPass>(); 1973 } 1974 1975 AU.addPreserved<BasicAAWrapperPass>(); 1976 AU.addPreserved<GlobalsAAWrapperPass>(); 1977 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1978 } 1979 }; 1980 1981 } // end anonymous namespace 1982 1983 //===----------------------------------------------------------------------===// 1984 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1985 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1986 //===----------------------------------------------------------------------===// 1987 1988 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1989 // We need to place the broadcast of invariant variables outside the loop, 1990 // but only if it's proven safe to do so. Else, broadcast will be inside 1991 // vector loop body. 
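  // For a hoistable value %x at VF=4 this emits, roughly (illustrative IR):
  //   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> undef, <4 x i32> zeroinitializer
  // in the vector preheader; otherwise the same sequence is emitted at the
  // current insert point inside the vector body.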
1992 Instruction *Instr = dyn_cast<Instruction>(V); 1993 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1994 (!Instr || 1995 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1996 // Place the code for broadcasting invariant variables in the new preheader. 1997 IRBuilder<>::InsertPointGuard Guard(Builder); 1998 if (SafeToHoist) 1999 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2000 2001 // Broadcast the scalar into all locations in the vector. 2002 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2003 2004 return Shuf; 2005 } 2006 2007 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2008 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 2009 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2010 "Expected either an induction phi-node or a truncate of it!"); 2011 Value *Start = II.getStartValue(); 2012 2013 // Construct the initial value of the vector IV in the vector loop preheader 2014 auto CurrIP = Builder.saveIP(); 2015 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2016 if (isa<TruncInst>(EntryVal)) { 2017 assert(Start->getType()->isIntegerTy() && 2018 "Truncation requires an integer type"); 2019 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2020 Step = Builder.CreateTrunc(Step, TruncType); 2021 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2022 } 2023 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2024 Value *SteppedStart = 2025 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2026 2027 // We create vector phi nodes for both integer and floating-point induction 2028 // variables. Here, we determine the kind of arithmetic we will perform. 2029 Instruction::BinaryOps AddOp; 2030 Instruction::BinaryOps MulOp; 2031 if (Step->getType()->isIntegerTy()) { 2032 AddOp = Instruction::Add; 2033 MulOp = Instruction::Mul; 2034 } else { 2035 AddOp = II.getInductionOpcode(); 2036 MulOp = Instruction::FMul; 2037 } 2038 2039 // Multiply the vectorization factor by the step using integer or 2040 // floating-point arithmetic as appropriate. 2041 Value *ConstVF = 2042 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2043 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 2044 2045 // Create a vector splat to use in the induction update. 2046 // 2047 // FIXME: If the step is non-constant, we create the vector splat with 2048 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2049 // handle a constant vector splat. 2050 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2051 Value *SplatVF = isa<Constant>(Mul) 2052 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2053 : Builder.CreateVectorSplat(VF, Mul); 2054 Builder.restoreIP(CurrIP); 2055 2056 // We may need to add the step a number of times, depending on the unroll 2057 // factor. The last of those goes into the PHI. 
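  // E.g. with VF=4, UF=2 and a step of 1: part 0 uses the phi itself
  // <i, i+1, i+2, i+3>, part 1 uses the phi plus the splat of 4, and the value
  // fed back into the phi from the latch is the phi plus 8 (illustrative).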
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);

    LastInduction = cast<Instruction>(addFastMathFlag(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}

bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}

bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}

void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
    const InductionDescriptor &ID, const Instruction *EntryVal,
    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
  if (isa<TruncInst>(EntryVal))
    return;

  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (Casts.empty())
    return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
  // induction update chain itself.
2122 Instruction *CastInst = *Casts.begin(); 2123 if (Lane < UINT_MAX) 2124 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 2125 else 2126 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 2127 } 2128 2129 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 2130 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2131 "Primary induction variable must have an integer type"); 2132 2133 auto II = Legal->getInductionVars().find(IV); 2134 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2135 2136 auto ID = II->second; 2137 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2138 2139 // The value from the original loop to which we are mapping the new induction 2140 // variable. 2141 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2142 2143 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2144 2145 // Generate code for the induction step. Note that induction steps are 2146 // required to be loop-invariant 2147 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2148 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2149 "Induction step should be loop invariant"); 2150 if (PSE.getSE()->isSCEVable(IV->getType())) { 2151 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2152 return Exp.expandCodeFor(Step, Step->getType(), 2153 LoopVectorPreHeader->getTerminator()); 2154 } 2155 return cast<SCEVUnknown>(Step)->getValue(); 2156 }; 2157 2158 // The scalar value to broadcast. This is derived from the canonical 2159 // induction variable. If a truncation type is given, truncate the canonical 2160 // induction variable and step. Otherwise, derive these values from the 2161 // induction descriptor. 2162 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2163 Value *ScalarIV = Induction; 2164 if (IV != OldInduction) { 2165 ScalarIV = IV->getType()->isIntegerTy() 2166 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2167 : Builder.CreateCast(Instruction::SIToFP, Induction, 2168 IV->getType()); 2169 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2170 ScalarIV->setName("offset.idx"); 2171 } 2172 if (Trunc) { 2173 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2174 assert(Step->getType()->isIntegerTy() && 2175 "Truncation requires an integer step"); 2176 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2177 Step = Builder.CreateTrunc(Step, TruncType); 2178 } 2179 return ScalarIV; 2180 }; 2181 2182 // Create the vector values from the scalar IV, in the absence of creating a 2183 // vector IV. 2184 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2185 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2186 for (unsigned Part = 0; Part < UF; ++Part) { 2187 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2188 Value *EntryPart = 2189 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2190 ID.getInductionOpcode()); 2191 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 2192 if (Trunc) 2193 addMetadata(EntryPart, Trunc); 2194 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 2195 } 2196 }; 2197 2198 // Now do the actual transformations, and start with creating the step value. 
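  // The cases below are, in order: a scalar VF (just splat the scalar IV), a
  // widened IV with no scalar users (vector phi only), a widened IV that also
  // has scalar users (vector phi plus scalar steps), and an IV whose users are
  // all scalar (scalar steps only, plus a splat when tail folding needs the
  // vector IV for the mask).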
2199 Value *Step = CreateStepValue(ID.getStep()); 2200 if (VF.isZero() || VF.isScalar()) { 2201 Value *ScalarIV = CreateScalarIV(Step); 2202 CreateSplatIV(ScalarIV, Step); 2203 return; 2204 } 2205 2206 // Determine if we want a scalar version of the induction variable. This is 2207 // true if the induction variable itself is not widened, or if it has at 2208 // least one user in the loop that is not widened. 2209 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2210 if (!NeedsScalarIV) { 2211 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2212 return; 2213 } 2214 2215 // Try to create a new independent vector induction variable. If we can't 2216 // create the phi node, we will splat the scalar induction variable in each 2217 // loop iteration. 2218 if (!shouldScalarizeInstruction(EntryVal)) { 2219 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2220 Value *ScalarIV = CreateScalarIV(Step); 2221 // Create scalar steps that can be used by instructions we will later 2222 // scalarize. Note that the addition of the scalar steps will not increase 2223 // the number of instructions in the loop in the common case prior to 2224 // InstCombine. We will be trading one vector extract for each scalar step. 2225 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2226 return; 2227 } 2228 2229 // All IV users are scalar instructions, so only emit a scalar IV, not a 2230 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2231 // predicate used by the masked loads/stores. 2232 Value *ScalarIV = CreateScalarIV(Step); 2233 if (!Cost->isScalarEpilogueAllowed()) 2234 CreateSplatIV(ScalarIV, Step); 2235 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2236 } 2237 2238 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2239 Instruction::BinaryOps BinOp) { 2240 // Create and check the types. 2241 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2242 int VLen = ValVTy->getNumElements(); 2243 2244 Type *STy = Val->getType()->getScalarType(); 2245 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2246 "Induction Step must be an integer or FP"); 2247 assert(Step->getType() == STy && "Step has wrong type"); 2248 2249 SmallVector<Constant *, 8> Indices; 2250 2251 if (STy->isIntegerTy()) { 2252 // Create a vector of consecutive numbers from zero to VF. 2253 for (int i = 0; i < VLen; ++i) 2254 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2255 2256 // Add the consecutive indices to the vector value. 2257 Constant *Cv = ConstantVector::get(Indices); 2258 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2259 Step = Builder.CreateVectorSplat(VLen, Step); 2260 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2261 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2262 // which can be found from the original scalar operations. 2263 Step = Builder.CreateMul(Cv, Step); 2264 return Builder.CreateAdd(Val, Step, "induction"); 2265 } 2266 2267 // Floating point induction. 2268 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2269 "Binary Opcode should be specified for FP induction"); 2270 // Create a vector of consecutive numbers from zero to VF. 2271 for (int i = 0; i < VLen; ++i) 2272 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2273 2274 // Add the consecutive indices to the vector value. 
2275 Constant *Cv = ConstantVector::get(Indices); 2276 2277 Step = Builder.CreateVectorSplat(VLen, Step); 2278 2279 // Floating point operations had to be 'fast' to enable the induction. 2280 FastMathFlags Flags; 2281 Flags.setFast(); 2282 2283 Value *MulOp = Builder.CreateFMul(Cv, Step); 2284 if (isa<Instruction>(MulOp)) 2285 // Have to check, MulOp may be a constant 2286 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2287 2288 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2289 if (isa<Instruction>(BOp)) 2290 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2291 return BOp; 2292 } 2293 2294 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2295 Instruction *EntryVal, 2296 const InductionDescriptor &ID) { 2297 // We shouldn't have to build scalar steps if we aren't vectorizing. 2298 assert(VF.isVector() && "VF should be greater than one"); 2299 // Get the value type and ensure it and the step have the same integer type. 2300 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2301 assert(ScalarIVTy == Step->getType() && 2302 "Val and Step should have the same type"); 2303 2304 // We build scalar steps for both integer and floating-point induction 2305 // variables. Here, we determine the kind of arithmetic we will perform. 2306 Instruction::BinaryOps AddOp; 2307 Instruction::BinaryOps MulOp; 2308 if (ScalarIVTy->isIntegerTy()) { 2309 AddOp = Instruction::Add; 2310 MulOp = Instruction::Mul; 2311 } else { 2312 AddOp = ID.getInductionOpcode(); 2313 MulOp = Instruction::FMul; 2314 } 2315 2316 // Determine the number of scalars we need to generate for each unroll 2317 // iteration. If EntryVal is uniform, we only need to generate the first 2318 // lane. Otherwise, we generate all VF values. 2319 unsigned Lanes = 2320 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2321 ? 1 2322 : VF.getKnownMinValue(); 2323 assert((!VF.isScalable() || Lanes == 1) && 2324 "Should never scalarize a scalable vector"); 2325 // Compute the scalar steps and save the results in VectorLoopValueMap. 2326 for (unsigned Part = 0; Part < UF; ++Part) { 2327 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2328 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2329 ScalarIVTy->getScalarSizeInBits()); 2330 Value *StartIdx = 2331 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2332 if (ScalarIVTy->isFloatingPointTy()) 2333 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2334 StartIdx = addFastMathFlag(Builder.CreateBinOp( 2335 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); 2336 // The step returned by `createStepForVF` is a runtime-evaluated value 2337 // when VF is scalable. Otherwise, it should be folded into a Constant. 
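      // (The value built below for this lane is
      // ScalarIV + (Part * VF + Lane) * Step, using the add/mul opcodes
      // selected above.)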
2338 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2339 "Expected StartIdx to be folded to a constant when VF is not " 2340 "scalable"); 2341 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2342 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2343 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2344 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2345 } 2346 } 2347 } 2348 2349 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2350 assert(V != Induction && "The new induction variable should not be used."); 2351 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2352 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2353 2354 // If we have a stride that is replaced by one, do it here. Defer this for 2355 // the VPlan-native path until we start running Legal checks in that path. 2356 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2357 V = ConstantInt::get(V->getType(), 1); 2358 2359 // If we have a vector mapped to this value, return it. 2360 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2361 return VectorLoopValueMap.getVectorValue(V, Part); 2362 2363 // If the value has not been vectorized, check if it has been scalarized 2364 // instead. If it has been scalarized, and we actually need the value in 2365 // vector form, we will construct the vector values on demand. 2366 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2367 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2368 2369 // If we've scalarized a value, that value should be an instruction. 2370 auto *I = cast<Instruction>(V); 2371 2372 // If we aren't vectorizing, we can just copy the scalar map values over to 2373 // the vector map. 2374 if (VF.isScalar()) { 2375 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2376 return ScalarValue; 2377 } 2378 2379 // Get the last scalar instruction we generated for V and Part. If the value 2380 // is known to be uniform after vectorization, this corresponds to lane zero 2381 // of the Part unroll iteration. Otherwise, the last instruction is the one 2382 // we created for the last vector lane of the Part unroll iteration. 2383 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2384 ? 0 2385 : VF.getKnownMinValue() - 1; 2386 assert((!VF.isScalable() || LastLane == 0) && 2387 "Scalable vectorization can't lead to any scalarized values."); 2388 auto *LastInst = cast<Instruction>( 2389 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2390 2391 // Set the insert point after the last scalarized instruction. This ensures 2392 // the insertelement sequence will directly follow the scalar definitions. 2393 auto OldIP = Builder.saveIP(); 2394 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2395 Builder.SetInsertPoint(&*NewIP); 2396 2397 // However, if we are vectorizing, we need to construct the vector values. 2398 // If the value is known to be uniform after vectorization, we can just 2399 // broadcast the scalar value corresponding to lane zero for each unroll 2400 // iteration. Otherwise, we construct the vector values using insertelement 2401 // instructions. Since the resulting vectors are stored in 2402 // VectorLoopValueMap, we will only generate the insertelements once. 
2403 Value *VectorValue = nullptr; 2404 if (Cost->isUniformAfterVectorization(I, VF)) { 2405 VectorValue = getBroadcastInstrs(ScalarValue); 2406 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2407 } else { 2408 // Initialize packing with insertelements to start from undef. 2409 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2410 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2411 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2412 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2413 packScalarIntoVectorValue(V, {Part, Lane}); 2414 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2415 } 2416 Builder.restoreIP(OldIP); 2417 return VectorValue; 2418 } 2419 2420 // If this scalar is unknown, assume that it is a constant or that it is 2421 // loop invariant. Broadcast V and save the value for future uses. 2422 Value *B = getBroadcastInstrs(V); 2423 VectorLoopValueMap.setVectorValue(V, Part, B); 2424 return B; 2425 } 2426 2427 Value * 2428 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2429 const VPIteration &Instance) { 2430 // If the value is not an instruction contained in the loop, it should 2431 // already be scalar. 2432 if (OrigLoop->isLoopInvariant(V)) 2433 return V; 2434 2435 assert(Instance.Lane > 0 2436 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2437 : true && "Uniform values only have lane zero"); 2438 2439 // If the value from the original loop has not been vectorized, it is 2440 // represented by UF x VF scalar values in the new loop. Return the requested 2441 // scalar value. 2442 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2443 return VectorLoopValueMap.getScalarValue(V, Instance); 2444 2445 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2446 // for the given unroll part. If this entry is not a vector type (i.e., the 2447 // vectorization factor is one), there is no need to generate an 2448 // extractelement instruction. 2449 auto *U = getOrCreateVectorValue(V, Instance.Part); 2450 if (!U->getType()->isVectorTy()) { 2451 assert(VF.isScalar() && "Value not scalarized has non-vector type"); 2452 return U; 2453 } 2454 2455 // Otherwise, the value from the original loop has been vectorized and is 2456 // represented by UF vector values. Extract and return the requested scalar 2457 // value from the appropriate vector lane. 
2458 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2459 } 2460 2461 void InnerLoopVectorizer::packScalarIntoVectorValue( 2462 Value *V, const VPIteration &Instance) { 2463 assert(V != Induction && "The new induction variable should not be used."); 2464 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2465 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2466 2467 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2468 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2469 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2470 Builder.getInt32(Instance.Lane)); 2471 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2472 } 2473 2474 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2475 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2476 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2477 SmallVector<int, 8> ShuffleMask; 2478 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2479 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2480 2481 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); 2482 } 2483 2484 // Return whether we allow using masked interleave-groups (for dealing with 2485 // strided loads/stores that reside in predicated blocks, or for dealing 2486 // with gaps). 2487 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2488 // If an override option has been passed in for interleaved accesses, use it. 2489 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2490 return EnableMaskedInterleavedMemAccesses; 2491 2492 return TTI.enableMaskedInterleavedAccessVectorization(); 2493 } 2494 2495 // Try to vectorize the interleave group that \p Instr belongs to. 2496 // 2497 // E.g. Translate following interleaved load group (factor = 3): 2498 // for (i = 0; i < N; i+=3) { 2499 // R = Pic[i]; // Member of index 0 2500 // G = Pic[i+1]; // Member of index 1 2501 // B = Pic[i+2]; // Member of index 2 2502 // ... // do something to R, G, B 2503 // } 2504 // To: 2505 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2506 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2507 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2508 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2509 // 2510 // Or translate following interleaved store group (factor = 3): 2511 // for (i = 0; i < N; i+=3) { 2512 // ... do something to R, G, B 2513 // Pic[i] = R; // Member of index 0 2514 // Pic[i+1] = G; // Member of index 1 2515 // Pic[i+2] = B; // Member of index 2 2516 // } 2517 // To: 2518 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2519 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2520 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2521 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2522 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2523 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2524 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2525 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2526 VPValue *BlockInMask) { 2527 Instruction *Instr = Group->getInsertPos(); 2528 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2529 2530 // Prepare for the vector type of the interleaved load/store. 
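  // E.g. for the factor-3 i32 group shown above at VF=4, VecTy below is
  // <12 x i32> (illustrative).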
2531 Type *ScalarTy = getMemInstValueType(Instr); 2532 unsigned InterleaveFactor = Group->getFactor(); 2533 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2534 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2535 2536 // Prepare for the new pointers. 2537 SmallVector<Value *, 2> AddrParts; 2538 unsigned Index = Group->getIndex(Instr); 2539 2540 // TODO: extend the masked interleaved-group support to reversed access. 2541 assert((!BlockInMask || !Group->isReverse()) && 2542 "Reversed masked interleave-group not supported."); 2543 2544 // If the group is reverse, adjust the index to refer to the last vector lane 2545 // instead of the first. We adjust the index from the first vector lane, 2546 // rather than directly getting the pointer for lane VF - 1, because the 2547 // pointer operand of the interleaved access is supposed to be uniform. For 2548 // uniform instructions, we're only required to generate a value for the 2549 // first vector lane in each unroll iteration. 2550 assert(!VF.isScalable() && 2551 "scalable vector reverse operation is not implemented"); 2552 if (Group->isReverse()) 2553 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2554 2555 for (unsigned Part = 0; Part < UF; Part++) { 2556 Value *AddrPart = State.get(Addr, {Part, 0}); 2557 setDebugLocFromInst(Builder, AddrPart); 2558 2559 // Notice current instruction could be any index. Need to adjust the address 2560 // to the member of index 0. 2561 // 2562 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2563 // b = A[i]; // Member of index 0 2564 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2565 // 2566 // E.g. A[i+1] = a; // Member of index 1 2567 // A[i] = b; // Member of index 0 2568 // A[i+2] = c; // Member of index 2 (Current instruction) 2569 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2570 2571 bool InBounds = false; 2572 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2573 InBounds = gep->isInBounds(); 2574 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2575 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2576 2577 // Cast to the vector pointer type. 2578 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2579 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2580 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2581 } 2582 2583 setDebugLocFromInst(Builder, Instr); 2584 Value *UndefVec = UndefValue::get(VecTy); 2585 2586 Value *MaskForGaps = nullptr; 2587 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2588 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2589 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2590 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2591 } 2592 2593 // Vectorize the interleaved load group. 2594 if (isa<LoadInst>(Instr)) { 2595 // For each unroll part, create a wide load for the group. 
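    // If the block is predicated or the group has gaps, the load is masked:
    // the per-part block mask is replicated across the group members and, if
    // needed, combined with the gap mask computed above.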
2596 SmallVector<Value *, 2> NewLoads; 2597 for (unsigned Part = 0; Part < UF; Part++) { 2598 Instruction *NewLoad; 2599 if (BlockInMask || MaskForGaps) { 2600 assert(useMaskedInterleavedAccesses(*TTI) && 2601 "masked interleaved groups are not allowed."); 2602 Value *GroupMask = MaskForGaps; 2603 if (BlockInMask) { 2604 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2605 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2606 Value *ShuffledMask = Builder.CreateShuffleVector( 2607 BlockInMaskPart, 2608 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2609 "interleaved.mask"); 2610 GroupMask = MaskForGaps 2611 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2612 MaskForGaps) 2613 : ShuffledMask; 2614 } 2615 NewLoad = 2616 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2617 GroupMask, UndefVec, "wide.masked.vec"); 2618 } 2619 else 2620 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2621 Group->getAlign(), "wide.vec"); 2622 Group->addMetadata(NewLoad); 2623 NewLoads.push_back(NewLoad); 2624 } 2625 2626 // For each member in the group, shuffle out the appropriate data from the 2627 // wide loads. 2628 unsigned J = 0; 2629 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2630 Instruction *Member = Group->getMember(I); 2631 2632 // Skip the gaps in the group. 2633 if (!Member) 2634 continue; 2635 2636 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2637 auto StrideMask = 2638 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2639 for (unsigned Part = 0; Part < UF; Part++) { 2640 Value *StridedVec = Builder.CreateShuffleVector( 2641 NewLoads[Part], StrideMask, "strided.vec"); 2642 2643 // If this member has different type, cast the result type. 2644 if (Member->getType() != ScalarTy) { 2645 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2646 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2647 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2648 } 2649 2650 if (Group->isReverse()) 2651 StridedVec = reverseVector(StridedVec); 2652 2653 State.set(VPDefs[J], Member, StridedVec, Part); 2654 } 2655 ++J; 2656 } 2657 return; 2658 } 2659 2660 // The sub vector type for current instruction. 2661 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2662 auto *SubVT = VectorType::get(ScalarTy, VF); 2663 2664 // Vectorize the interleaved store group. 2665 for (unsigned Part = 0; Part < UF; Part++) { 2666 // Collect the stored vector from each member. 2667 SmallVector<Value *, 4> StoredVecs; 2668 for (unsigned i = 0; i < InterleaveFactor; i++) { 2669 // Interleaved store group doesn't allow a gap, so each index has a member 2670 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2671 2672 Value *StoredVec = State.get(StoredValues[i], Part); 2673 2674 if (Group->isReverse()) 2675 StoredVec = reverseVector(StoredVec); 2676 2677 // If this member has different type, cast it to a unified type. 2678 2679 if (StoredVec->getType() != SubVT) 2680 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2681 2682 StoredVecs.push_back(StoredVec); 2683 } 2684 2685 // Concatenate all vectors into a wide vector. 2686 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2687 2688 // Interleave the elements in the wide vector. 
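// For instance (illustrative), with VF = 4 and an interleave factor of 3 the
// mask produced by createInterleaveMask is
//   <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
// which rearranges the concatenated R, G and B vectors into
// R0,G0,B0,R1,G1,B1,..., matching the interleave mask shown in the store
// example in the comment above vectorizeInterleaveGroup.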
2689 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2690 Value *IVec = Builder.CreateShuffleVector( 2691 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2692 "interleaved.vec"); 2693 2694 Instruction *NewStoreInstr; 2695 if (BlockInMask) { 2696 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2697 Value *ShuffledMask = Builder.CreateShuffleVector( 2698 BlockInMaskPart, 2699 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2700 "interleaved.mask"); 2701 NewStoreInstr = Builder.CreateMaskedStore( 2702 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2703 } 2704 else 2705 NewStoreInstr = 2706 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2707 2708 Group->addMetadata(NewStoreInstr); 2709 } 2710 } 2711 2712 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2713 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2714 VPValue *StoredValue, VPValue *BlockInMask) { 2715 // Attempt to issue a wide load. 2716 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2717 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2718 2719 assert((LI || SI) && "Invalid Load/Store instruction"); 2720 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2721 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2722 2723 LoopVectorizationCostModel::InstWidening Decision = 2724 Cost->getWideningDecision(Instr, VF); 2725 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2726 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2727 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2728 "CM decision is not to widen the memory instruction"); 2729 2730 Type *ScalarDataTy = getMemInstValueType(Instr); 2731 2732 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2733 const Align Alignment = getLoadStoreAlignment(Instr); 2734 2735 // Determine if the pointer operand of the access is either consecutive or 2736 // reverse consecutive. 2737 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2738 bool ConsecutiveStride = 2739 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2740 bool CreateGatherScatter = 2741 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2742 2743 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2744 // gather/scatter. Otherwise Decision should have been to Scalarize. 2745 assert((ConsecutiveStride || CreateGatherScatter) && 2746 "The instruction should be scalarized"); 2747 (void)ConsecutiveStride; 2748 2749 VectorParts BlockInMaskParts(UF); 2750 bool isMaskRequired = BlockInMask; 2751 if (isMaskRequired) 2752 for (unsigned Part = 0; Part < UF; ++Part) 2753 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2754 2755 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2756 // Calculate the pointer for the specific unroll-part. 2757 GetElementPtrInst *PartPtr = nullptr; 2758 2759 bool InBounds = false; 2760 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2761 InBounds = gep->isInBounds(); 2762 2763 if (Reverse) { 2764 assert(!VF.isScalable() && 2765 "Reversing vectors is not yet supported for scalable vectors."); 2766 2767 // If the address is consecutive but reversed, then the 2768 // wide store needs to start at the last vector element. 
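// A small worked example (illustrative): with VF = 4 and Part = 1, the two
// GEPs below offset the pointer by -4 and then by -3, i.e. by -7 elements in
// total, so the wide access covers the original elements i-7 .. i-4; the
// loaded or stored data is then put back into the scalar loop's order
// (i-4, i-5, i-6, i-7) by reverseVector.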
2769 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2770 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2771 PartPtr->setIsInBounds(InBounds); 2772 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2773 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2774 PartPtr->setIsInBounds(InBounds); 2775 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2776 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2777 } else { 2778 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2779 PartPtr = cast<GetElementPtrInst>( 2780 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2781 PartPtr->setIsInBounds(InBounds); 2782 } 2783 2784 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2785 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2786 }; 2787 2788 // Handle Stores: 2789 if (SI) { 2790 setDebugLocFromInst(Builder, SI); 2791 2792 for (unsigned Part = 0; Part < UF; ++Part) { 2793 Instruction *NewSI = nullptr; 2794 Value *StoredVal = State.get(StoredValue, Part); 2795 if (CreateGatherScatter) { 2796 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2797 Value *VectorGep = State.get(Addr, Part); 2798 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2799 MaskPart); 2800 } else { 2801 if (Reverse) { 2802 // If we store to reverse consecutive memory locations, then we need 2803 // to reverse the order of elements in the stored value. 2804 StoredVal = reverseVector(StoredVal); 2805 // We don't want to update the value in the map as it might be used in 2806 // another expression. So don't call resetVectorValue(StoredVal). 2807 } 2808 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2809 if (isMaskRequired) 2810 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2811 BlockInMaskParts[Part]); 2812 else 2813 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2814 } 2815 addMetadata(NewSI, SI); 2816 } 2817 return; 2818 } 2819 2820 // Handle loads. 2821 assert(LI && "Must have a load instruction"); 2822 setDebugLocFromInst(Builder, LI); 2823 for (unsigned Part = 0; Part < UF; ++Part) { 2824 Value *NewLI; 2825 if (CreateGatherScatter) { 2826 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2827 Value *VectorGep = State.get(Addr, Part); 2828 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2829 nullptr, "wide.masked.gather"); 2830 addMetadata(NewLI, LI); 2831 } else { 2832 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2833 if (isMaskRequired) 2834 NewLI = Builder.CreateMaskedLoad( 2835 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2836 "wide.masked.load"); 2837 else 2838 NewLI = 2839 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2840 2841 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2842 addMetadata(NewLI, LI); 2843 if (Reverse) 2844 NewLI = reverseVector(NewLI); 2845 } 2846 2847 State.set(Def, Instr, NewLI, Part); 2848 } 2849 } 2850 2851 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2852 const VPIteration &Instance, 2853 bool IfPredicateInstr, 2854 VPTransformState &State) { 2855 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2856 2857 setDebugLocFromInst(Builder, Instr); 2858 2859 // Does this instruction return a value ? 
2860 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2861
2862 Instruction *Cloned = Instr->clone();
2863 if (!IsVoidRetTy)
2864 Cloned->setName(Instr->getName() + ".cloned");
2865
2866 // Replace the operands of the cloned instructions with their scalar
2867 // equivalents in the new loop.
2868 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2869 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2870 auto InputInstance = Instance;
2871 if (!Operand || !OrigLoop->contains(Operand) ||
2872 (Cost->isUniformAfterVectorization(Operand, State.VF)))
2873 InputInstance.Lane = 0;
2874 auto *NewOp = State.get(User.getOperand(op), InputInstance);
2875 Cloned->setOperand(op, NewOp);
2876 }
2877 addNewMetadata(Cloned, Instr);
2878
2879 // Place the cloned scalar in the new loop.
2880 Builder.Insert(Cloned);
2881
2882 // TODO: Set result for VPValue of VPReplicateRecipe. This requires
2883 // representing scalar values in VPTransformState. Add the cloned scalar to
2884 // the scalar map entry.
2885 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2886
2887 // If we just cloned a new assumption, add it to the assumption cache.
2888 if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2889 if (II->getIntrinsicID() == Intrinsic::assume)
2890 AC->registerAssumption(II);
2891
2892 // End if-block.
2893 if (IfPredicateInstr)
2894 PredicatedInstructions.push_back(Cloned);
2895 }
2896
2897 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2898 Value *End, Value *Step,
2899 Instruction *DL) {
2900 BasicBlock *Header = L->getHeader();
2901 BasicBlock *Latch = L->getLoopLatch();
2902 // As we're just creating this loop, it's possible no latch exists
2903 // yet. If so, use the header as this will be a single block loop.
2904 if (!Latch)
2905 Latch = Header;
2906
2907 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2908 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2909 setDebugLocFromInst(Builder, OldInst);
2910 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2911
2912 Builder.SetInsertPoint(Latch->getTerminator());
2913 setDebugLocFromInst(Builder, OldInst);
2914
2915 // Create i+1 and fill the PHINode.
2916 Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2917 Induction->addIncoming(Start, L->getLoopPreheader());
2918 Induction->addIncoming(Next, Latch);
2919 // Create the compare.
2920 Value *ICmp = Builder.CreateICmpEQ(Next, End);
2921 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
2922
2923 // Now we have two terminators. Remove the old one from the block.
2924 Latch->getTerminator()->eraseFromParent();
2925
2926 return Induction;
2927 }
2928
2929 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2930 if (TripCount)
2931 return TripCount;
2932
2933 assert(L && "Create Trip Count for null loop.");
2934 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2935 // Find the loop boundaries.
2936 ScalarEvolution *SE = PSE.getSE();
2937 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2938 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2939 "Invalid loop count");
2940
2941 Type *IdxTy = Legal->getWidestInductionType();
2942 assert(IdxTy && "No type for induction");
2943
2944 // The exit count might have the type of i64 while the phi is i32. This can
2945 // happen if we have an induction variable that is sign extended before the
2946 // compare.
The only way that we get a backedge taken count is that the 2947 // induction variable was signed and as such will not overflow. In such a case 2948 // truncation is legal. 2949 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2950 IdxTy->getPrimitiveSizeInBits()) 2951 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2952 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2953 2954 // Get the total trip count from the count by adding 1. 2955 const SCEV *ExitCount = SE->getAddExpr( 2956 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2957 2958 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2959 2960 // Expand the trip count and place the new instructions in the preheader. 2961 // Notice that the pre-header does not change, only the loop body. 2962 SCEVExpander Exp(*SE, DL, "induction"); 2963 2964 // Count holds the overall loop count (N). 2965 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2966 L->getLoopPreheader()->getTerminator()); 2967 2968 if (TripCount->getType()->isPointerTy()) 2969 TripCount = 2970 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2971 L->getLoopPreheader()->getTerminator()); 2972 2973 return TripCount; 2974 } 2975 2976 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2977 if (VectorTripCount) 2978 return VectorTripCount; 2979 2980 Value *TC = getOrCreateTripCount(L); 2981 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2982 2983 Type *Ty = TC->getType(); 2984 // This is where we can make the step a runtime constant. 2985 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 2986 2987 // If the tail is to be folded by masking, round the number of iterations N 2988 // up to a multiple of Step instead of rounding down. This is done by first 2989 // adding Step-1 and then rounding down. Note that it's ok if this addition 2990 // overflows: the vector induction variable will eventually wrap to zero given 2991 // that it starts at zero and its Step is a power of two; the loop will then 2992 // exit, with the last early-exit vector comparison also producing all-true. 2993 if (Cost->foldTailByMasking()) { 2994 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2995 "VF*UF must be a power of 2 when folding tail by masking"); 2996 assert(!VF.isScalable() && 2997 "Tail folding not yet supported for scalable vectors"); 2998 TC = Builder.CreateAdd( 2999 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3000 } 3001 3002 // Now we need to generate the expression for the part of the loop that the 3003 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3004 // iterations are not required for correctness, or N - Step, otherwise. Step 3005 // is equal to the vectorization factor (number of SIMD elements) times the 3006 // unroll factor (number of SIMD instructions). 3007 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3008 3009 // There are two cases where we need to ensure (at least) the last iteration 3010 // runs in the scalar remainder loop. Thus, if the step evenly divides 3011 // the trip count, we set the remainder to be equal to the step. If the step 3012 // does not evenly divide the trip count, no adjustment is necessary since 3013 // there will already be scalar iterations. Note that the minimum iterations 3014 // check ensures that N >= Step. 
The cases are: 3015 // 1) If there is a non-reversed interleaved group that may speculatively 3016 // access memory out-of-bounds. 3017 // 2) If any instruction may follow a conditionally taken exit. That is, if 3018 // the loop contains multiple exiting blocks, or a single exiting block 3019 // which is not the latch. 3020 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3021 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3022 R = Builder.CreateSelect(IsZero, Step, R); 3023 } 3024 3025 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3026 3027 return VectorTripCount; 3028 } 3029 3030 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3031 const DataLayout &DL) { 3032 // Verify that V is a vector type with same number of elements as DstVTy. 3033 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3034 unsigned VF = DstFVTy->getNumElements(); 3035 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3036 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3037 Type *SrcElemTy = SrcVecTy->getElementType(); 3038 Type *DstElemTy = DstFVTy->getElementType(); 3039 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3040 "Vector elements must have same size"); 3041 3042 // Do a direct cast if element types are castable. 3043 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3044 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3045 } 3046 // V cannot be directly casted to desired vector type. 3047 // May happen when V is a floating point vector but DstVTy is a vector of 3048 // pointers or vice-versa. Handle this using a two-step bitcast using an 3049 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3050 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3051 "Only one type should be a pointer type"); 3052 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3053 "Only one type should be a floating point type"); 3054 Type *IntTy = 3055 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3056 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3057 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3058 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3059 } 3060 3061 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3062 BasicBlock *Bypass) { 3063 Value *Count = getOrCreateTripCount(L); 3064 // Reuse existing vector loop preheader for TC checks. 3065 // Note that new preheader block is generated for vector loop. 3066 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3067 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3068 3069 // Generate code to check if the loop's trip count is less than VF * UF, or 3070 // equal to it in case a scalar epilogue is required; this implies that the 3071 // vector trip count is zero. This check also covers the case where adding one 3072 // to the backedge-taken count overflowed leading to an incorrect trip count 3073 // of zero. In this case we will also jump to the scalar loop. 3074 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3075 : ICmpInst::ICMP_ULT; 3076 3077 // If tail is to be folded, vector loop takes care of all iterations. 
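// A numeric sketch (illustrative): with a trip count of 10 and VF * UF = 8,
// the ULT check below is "10 < 8", which is false, so execution continues into
// the vector loop. When a scalar epilogue is required the predicate is ULE, so
// a trip count of exactly 8 also branches to the scalar loop, guaranteeing
// that at least one scalar iteration remains.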
3078 Value *CheckMinIters = Builder.getFalse(); 3079 if (!Cost->foldTailByMasking()) { 3080 Value *Step = 3081 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3082 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3083 } 3084 // Create new preheader for vector loop. 3085 LoopVectorPreHeader = 3086 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3087 "vector.ph"); 3088 3089 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3090 DT->getNode(Bypass)->getIDom()) && 3091 "TC check is expected to dominate Bypass"); 3092 3093 // Update dominator for Bypass & LoopExit. 3094 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3095 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3096 3097 ReplaceInstWithInst( 3098 TCCheckBlock->getTerminator(), 3099 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3100 LoopBypassBlocks.push_back(TCCheckBlock); 3101 } 3102 3103 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3104 // Reuse existing vector loop preheader for SCEV checks. 3105 // Note that new preheader block is generated for vector loop. 3106 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 3107 3108 // Generate the code to check that the SCEV assumptions that we made. 3109 // We want the new basic block to start at the first instruction in a 3110 // sequence of instructions that form a check. 3111 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 3112 "scev.check"); 3113 Value *SCEVCheck = Exp.expandCodeForPredicate( 3114 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 3115 3116 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 3117 if (C->isZero()) 3118 return; 3119 3120 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3121 (OptForSizeBasedOnProfile && 3122 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3123 "Cannot SCEV check stride or overflow when optimizing for size"); 3124 3125 SCEVCheckBlock->setName("vector.scevcheck"); 3126 // Create new preheader for vector loop. 3127 LoopVectorPreHeader = 3128 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 3129 nullptr, "vector.ph"); 3130 3131 // Update dominator only if this is first RT check. 3132 if (LoopBypassBlocks.empty()) { 3133 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3134 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3135 } 3136 3137 ReplaceInstWithInst( 3138 SCEVCheckBlock->getTerminator(), 3139 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 3140 LoopBypassBlocks.push_back(SCEVCheckBlock); 3141 AddedSafetyChecks = true; 3142 } 3143 3144 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 3145 // VPlan-native path does not do any analysis for runtime checks currently. 3146 if (EnableVPlanNativePath) 3147 return; 3148 3149 // Reuse existing vector loop preheader for runtime memory checks. 3150 // Note that new preheader block is generated for vector loop. 3151 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 3152 3153 // Generate the code that checks in runtime if arrays overlap. We put the 3154 // checks into a separate block to make the more common case of few elements 3155 // faster. 
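// Conceptually (illustrative only; the actual checks are emitted by
// addRuntimeChecks below), for two accessed ranges [AStart, AEnd) and
// [BStart, BEnd) the no-overlap condition is
//   AEnd <= BStart || BEnd <= AStart
// and the bypass branch to the scalar loop is taken whenever some pair of
// ranges may overlap.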
3156 auto *LAI = Legal->getLAI(); 3157 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 3158 if (!RtPtrChecking.Need) 3159 return; 3160 3161 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3162 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3163 "Cannot emit memory checks when optimizing for size, unless forced " 3164 "to vectorize."); 3165 ORE->emit([&]() { 3166 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3167 L->getStartLoc(), L->getHeader()) 3168 << "Code-size may be reduced by not forcing " 3169 "vectorization, or by source-code modifications " 3170 "eliminating the need for runtime checks " 3171 "(e.g., adding 'restrict')."; 3172 }); 3173 } 3174 3175 MemCheckBlock->setName("vector.memcheck"); 3176 // Create new preheader for vector loop. 3177 LoopVectorPreHeader = 3178 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 3179 "vector.ph"); 3180 3181 auto *CondBranch = cast<BranchInst>( 3182 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 3183 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 3184 LoopBypassBlocks.push_back(MemCheckBlock); 3185 AddedSafetyChecks = true; 3186 3187 // Update dominator only if this is first RT check. 3188 if (LoopBypassBlocks.empty()) { 3189 DT->changeImmediateDominator(Bypass, MemCheckBlock); 3190 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 3191 } 3192 3193 Instruction *FirstCheckInst; 3194 Instruction *MemRuntimeCheck; 3195 std::tie(FirstCheckInst, MemRuntimeCheck) = 3196 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 3197 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 3198 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 3199 "claimed checks are required"); 3200 CondBranch->setCondition(MemRuntimeCheck); 3201 3202 // We currently don't use LoopVersioning for the actual loop cloning but we 3203 // still use it to add the noalias metadata. 3204 LVer = std::make_unique<LoopVersioning>( 3205 *Legal->getLAI(), 3206 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3207 DT, PSE.getSE()); 3208 LVer->prepareNoAliasMetadata(); 3209 } 3210 3211 Value *InnerLoopVectorizer::emitTransformedIndex( 3212 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3213 const InductionDescriptor &ID) const { 3214 3215 SCEVExpander Exp(*SE, DL, "induction"); 3216 auto Step = ID.getStep(); 3217 auto StartValue = ID.getStartValue(); 3218 assert(Index->getType() == Step->getType() && 3219 "Index type does not match StepValue type"); 3220 3221 // Note: the IR at this point is broken. We cannot use SE to create any new 3222 // SCEV and then expand it, hoping that SCEV's simplification will give us 3223 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3224 // lead to various SCEV crashes. So all we can do is to use builder and rely 3225 // on InstCombine for future simplifications. Here we handle some trivial 3226 // cases only. 
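// In the common integer case this amounts to computing, by hand,
//   transformed = StartValue + Index * Step
// e.g. (illustrative) an induction with start 100 and step 4 maps Index = 3 to
// 112; the multiply and add are folded away below when Step is 1 or StartValue
// is 0.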
3227 auto CreateAdd = [&B](Value *X, Value *Y) { 3228 assert(X->getType() == Y->getType() && "Types don't match!"); 3229 if (auto *CX = dyn_cast<ConstantInt>(X)) 3230 if (CX->isZero()) 3231 return Y; 3232 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3233 if (CY->isZero()) 3234 return X; 3235 return B.CreateAdd(X, Y); 3236 }; 3237 3238 auto CreateMul = [&B](Value *X, Value *Y) { 3239 assert(X->getType() == Y->getType() && "Types don't match!"); 3240 if (auto *CX = dyn_cast<ConstantInt>(X)) 3241 if (CX->isOne()) 3242 return Y; 3243 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3244 if (CY->isOne()) 3245 return X; 3246 return B.CreateMul(X, Y); 3247 }; 3248 3249 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3250 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3251 // the DomTree is not kept up-to-date for additional blocks generated in the 3252 // vector loop. By using the header as insertion point, we guarantee that the 3253 // expanded instructions dominate all their uses. 3254 auto GetInsertPoint = [this, &B]() { 3255 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3256 if (InsertBB != LoopVectorBody && 3257 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3258 return LoopVectorBody->getTerminator(); 3259 return &*B.GetInsertPoint(); 3260 }; 3261 switch (ID.getKind()) { 3262 case InductionDescriptor::IK_IntInduction: { 3263 assert(Index->getType() == StartValue->getType() && 3264 "Index type does not match StartValue type"); 3265 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3266 return B.CreateSub(StartValue, Index); 3267 auto *Offset = CreateMul( 3268 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3269 return CreateAdd(StartValue, Offset); 3270 } 3271 case InductionDescriptor::IK_PtrInduction: { 3272 assert(isa<SCEVConstant>(Step) && 3273 "Expected constant step for pointer induction"); 3274 return B.CreateGEP( 3275 StartValue->getType()->getPointerElementType(), StartValue, 3276 CreateMul(Index, 3277 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3278 } 3279 case InductionDescriptor::IK_FpInduction: { 3280 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3281 auto InductionBinOp = ID.getInductionBinOp(); 3282 assert(InductionBinOp && 3283 (InductionBinOp->getOpcode() == Instruction::FAdd || 3284 InductionBinOp->getOpcode() == Instruction::FSub) && 3285 "Original bin op should be defined for FP induction"); 3286 3287 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3288 3289 // Floating point operations had to be 'fast' to enable the induction. 3290 FastMathFlags Flags; 3291 Flags.setFast(); 3292 3293 Value *MulExp = B.CreateFMul(StepValue, Index); 3294 if (isa<Instruction>(MulExp)) 3295 // We have to check, the MulExp may be a constant. 
3296 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3297 3298 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3299 "induction"); 3300 if (isa<Instruction>(BOp)) 3301 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3302 3303 return BOp; 3304 } 3305 case InductionDescriptor::IK_NoInduction: 3306 return nullptr; 3307 } 3308 llvm_unreachable("invalid enum"); 3309 } 3310 3311 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3312 LoopScalarBody = OrigLoop->getHeader(); 3313 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3314 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3315 assert(LoopExitBlock && "Must have an exit block"); 3316 assert(LoopVectorPreHeader && "Invalid loop structure"); 3317 3318 LoopMiddleBlock = 3319 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3320 LI, nullptr, Twine(Prefix) + "middle.block"); 3321 LoopScalarPreHeader = 3322 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3323 nullptr, Twine(Prefix) + "scalar.ph"); 3324 3325 // Set up branch from middle block to the exit and scalar preheader blocks. 3326 // completeLoopSkeleton will update the condition to use an iteration check, 3327 // if required to decide whether to execute the remainder. 3328 BranchInst *BrInst = 3329 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3330 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3331 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3332 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3333 3334 // We intentionally don't let SplitBlock to update LoopInfo since 3335 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3336 // LoopVectorBody is explicitly added to the correct place few lines later. 3337 LoopVectorBody = 3338 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3339 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3340 3341 // Update dominator for loop exit. 3342 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3343 3344 // Create and register the new vector loop. 3345 Loop *Lp = LI->AllocateLoop(); 3346 Loop *ParentLoop = OrigLoop->getParentLoop(); 3347 3348 // Insert the new loop into the loop nest and register the new basic blocks 3349 // before calling any utilities such as SCEV that require valid LoopInfo. 3350 if (ParentLoop) { 3351 ParentLoop->addChildLoop(Lp); 3352 } else { 3353 LI->addTopLevelLoop(Lp); 3354 } 3355 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3356 return Lp; 3357 } 3358 3359 void InnerLoopVectorizer::createInductionResumeValues( 3360 Loop *L, Value *VectorTripCount, 3361 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3362 assert(VectorTripCount && L && "Expected valid arguments"); 3363 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3364 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3365 "Inconsistent information about additional bypass."); 3366 // We are going to resume the execution of the scalar loop. 3367 // Go over all of the induction variables that we found and fix the 3368 // PHIs that are left in the scalar version of the loop. 3369 // The starting values of PHI nodes depend on the counter of the last 3370 // iteration in the vectorized loop. 3371 // If we come from a bypass edge then we need to start from the original 3372 // start value. 
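// For example (illustrative): with a primary induction starting at 0, an
// original trip count of 100 and VF * UF = 8, the vector loop covers 96
// iterations; bc.resume.val is then 96 when the scalar loop is entered from
// the middle block, and the original start value 0 when it is entered from a
// bypass (runtime-check failure) edge.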
3373 for (auto &InductionEntry : Legal->getInductionVars()) { 3374 PHINode *OrigPhi = InductionEntry.first; 3375 InductionDescriptor II = InductionEntry.second; 3376 3377 // Create phi nodes to merge from the backedge-taken check block. 3378 PHINode *BCResumeVal = 3379 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3380 LoopScalarPreHeader->getTerminator()); 3381 // Copy original phi DL over to the new one. 3382 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3383 Value *&EndValue = IVEndValues[OrigPhi]; 3384 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3385 if (OrigPhi == OldInduction) { 3386 // We know what the end value is. 3387 EndValue = VectorTripCount; 3388 } else { 3389 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3390 Type *StepType = II.getStep()->getType(); 3391 Instruction::CastOps CastOp = 3392 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3393 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3394 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3395 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3396 EndValue->setName("ind.end"); 3397 3398 // Compute the end value for the additional bypass (if applicable). 3399 if (AdditionalBypass.first) { 3400 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3401 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3402 StepType, true); 3403 CRD = 3404 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3405 EndValueFromAdditionalBypass = 3406 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3407 EndValueFromAdditionalBypass->setName("ind.end"); 3408 } 3409 } 3410 // The new PHI merges the original incoming value, in case of a bypass, 3411 // or the value at the end of the vectorized loop. 3412 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3413 3414 // Fix the scalar body counter (PHI node). 3415 // The old induction's phi node in the scalar body needs the truncated 3416 // value. 3417 for (BasicBlock *BB : LoopBypassBlocks) 3418 BCResumeVal->addIncoming(II.getStartValue(), BB); 3419 3420 if (AdditionalBypass.first) 3421 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3422 EndValueFromAdditionalBypass); 3423 3424 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3425 } 3426 } 3427 3428 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3429 MDNode *OrigLoopID) { 3430 assert(L && "Expected valid loop."); 3431 3432 // The trip counts should be cached by now. 3433 Value *Count = getOrCreateTripCount(L); 3434 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3435 3436 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3437 3438 // Add a check in the middle block to see if we have completed 3439 // all of the iterations in the first vector loop. 3440 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3441 // If tail is to be folded, we know we don't need to run the remainder. 3442 if (!Cost->foldTailByMasking()) { 3443 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3444 Count, VectorTripCount, "cmp.n", 3445 LoopMiddleBlock->getTerminator()); 3446 3447 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3448 // of the corresponding compare because they may have ended up with 3449 // different line numbers and we want to avoid awkward line stepping while 3450 // debugging. Eg. if the compare has got a line number inside the loop. 
3451 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3452 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3453 } 3454 3455 // Get ready to start creating new instructions into the vectorized body. 3456 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3457 "Inconsistent vector loop preheader"); 3458 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3459 3460 Optional<MDNode *> VectorizedLoopID = 3461 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3462 LLVMLoopVectorizeFollowupVectorized}); 3463 if (VectorizedLoopID.hasValue()) { 3464 L->setLoopID(VectorizedLoopID.getValue()); 3465 3466 // Do not setAlreadyVectorized if loop attributes have been defined 3467 // explicitly. 3468 return LoopVectorPreHeader; 3469 } 3470 3471 // Keep all loop hints from the original loop on the vector loop (we'll 3472 // replace the vectorizer-specific hints below). 3473 if (MDNode *LID = OrigLoop->getLoopID()) 3474 L->setLoopID(LID); 3475 3476 LoopVectorizeHints Hints(L, true, *ORE); 3477 Hints.setAlreadyVectorized(); 3478 3479 #ifdef EXPENSIVE_CHECKS 3480 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3481 LI->verify(*DT); 3482 #endif 3483 3484 return LoopVectorPreHeader; 3485 } 3486 3487 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3488 /* 3489 In this function we generate a new loop. The new loop will contain 3490 the vectorized instructions while the old loop will continue to run the 3491 scalar remainder. 3492 3493 [ ] <-- loop iteration number check. 3494 / | 3495 / v 3496 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3497 | / | 3498 | / v 3499 || [ ] <-- vector pre header. 3500 |/ | 3501 | v 3502 | [ ] \ 3503 | [ ]_| <-- vector loop. 3504 | | 3505 | v 3506 | -[ ] <--- middle-block. 3507 | / | 3508 | / v 3509 -|- >[ ] <--- new preheader. 3510 | | 3511 | v 3512 | [ ] \ 3513 | [ ]_| <-- old scalar loop to handle remainder. 3514 \ | 3515 \ v 3516 >[ ] <-- exit block. 3517 ... 3518 */ 3519 3520 // Get the metadata of the original loop before it gets modified. 3521 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3522 3523 // Create an empty vector loop, and prepare basic blocks for the runtime 3524 // checks. 3525 Loop *Lp = createVectorLoopSkeleton(""); 3526 3527 // Now, compare the new count to zero. If it is zero skip the vector loop and 3528 // jump to the scalar loop. This check also covers the case where the 3529 // backedge-taken count is uint##_max: adding one to it will overflow leading 3530 // to an incorrect trip count of zero. In this (rare) case we will also jump 3531 // to the scalar loop. 3532 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3533 3534 // Generate the code to check any assumptions that we've made for SCEV 3535 // expressions. 3536 emitSCEVChecks(Lp, LoopScalarPreHeader); 3537 3538 // Generate the code that checks in runtime if arrays overlap. We put the 3539 // checks into a separate block to make the more common case of few elements 3540 // faster. 3541 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3542 3543 // Some loops have a single integer induction variable, while other loops 3544 // don't. One example is c++ iterators that often have multiple pointer 3545 // induction variables. In the code below we also support a case where we 3546 // don't have a single induction variable. 3547 // 3548 // We try to obtain an induction variable from the original loop as hard 3549 // as possible. 
However if we don't find one that: 3550 // - is an integer 3551 // - counts from zero, stepping by one 3552 // - is the size of the widest induction variable type 3553 // then we create a new one. 3554 OldInduction = Legal->getPrimaryInduction(); 3555 Type *IdxTy = Legal->getWidestInductionType(); 3556 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3557 // The loop step is equal to the vectorization factor (num of SIMD elements) 3558 // times the unroll factor (num of SIMD instructions). 3559 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3560 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3561 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3562 Induction = 3563 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3564 getDebugLocFromInstOrOperands(OldInduction)); 3565 3566 // Emit phis for the new starting index of the scalar loop. 3567 createInductionResumeValues(Lp, CountRoundDown); 3568 3569 return completeLoopSkeleton(Lp, OrigLoopID); 3570 } 3571 3572 // Fix up external users of the induction variable. At this point, we are 3573 // in LCSSA form, with all external PHIs that use the IV having one input value, 3574 // coming from the remainder loop. We need those PHIs to also have a correct 3575 // value for the IV when arriving directly from the middle block. 3576 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3577 const InductionDescriptor &II, 3578 Value *CountRoundDown, Value *EndValue, 3579 BasicBlock *MiddleBlock) { 3580 // There are two kinds of external IV usages - those that use the value 3581 // computed in the last iteration (the PHI) and those that use the penultimate 3582 // value (the value that feeds into the phi from the loop latch). 3583 // We allow both, but they, obviously, have different values. 3584 3585 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3586 3587 DenseMap<Value *, Value *> MissingVals; 3588 3589 // An external user of the last iteration's value should see the value that 3590 // the remainder loop uses to initialize its own IV. 3591 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3592 for (User *U : PostInc->users()) { 3593 Instruction *UI = cast<Instruction>(U); 3594 if (!OrigLoop->contains(UI)) { 3595 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3596 MissingVals[UI] = EndValue; 3597 } 3598 } 3599 3600 // An external user of the penultimate value need to see EndValue - Step. 3601 // The simplest way to get this is to recompute it from the constituent SCEVs, 3602 // that is Start + (Step * (CRD - 1)). 3603 for (User *U : OrigPhi->users()) { 3604 auto *UI = cast<Instruction>(U); 3605 if (!OrigLoop->contains(UI)) { 3606 const DataLayout &DL = 3607 OrigLoop->getHeader()->getModule()->getDataLayout(); 3608 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3609 3610 IRBuilder<> B(MiddleBlock->getTerminator()); 3611 Value *CountMinusOne = B.CreateSub( 3612 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3613 Value *CMO = 3614 !II.getStep()->getType()->isIntegerTy() 3615 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3616 II.getStep()->getType()) 3617 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3618 CMO->setName("cast.cmo"); 3619 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3620 Escape->setName("ind.escape"); 3621 MissingVals[UI] = Escape; 3622 } 3623 } 3624 3625 for (auto &I : MissingVals) { 3626 PHINode *PHI = cast<PHINode>(I.first); 3627 // One corner case we have to handle is two IVs "chasing" each-other, 3628 // that is %IV2 = phi [...], [ %IV1, %latch ] 3629 // In this case, if IV1 has an external use, we need to avoid adding both 3630 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3631 // don't already have an incoming value for the middle block. 3632 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3633 PHI->addIncoming(I.second, MiddleBlock); 3634 } 3635 } 3636 3637 namespace { 3638 3639 struct CSEDenseMapInfo { 3640 static bool canHandle(const Instruction *I) { 3641 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3642 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3643 } 3644 3645 static inline Instruction *getEmptyKey() { 3646 return DenseMapInfo<Instruction *>::getEmptyKey(); 3647 } 3648 3649 static inline Instruction *getTombstoneKey() { 3650 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3651 } 3652 3653 static unsigned getHashValue(const Instruction *I) { 3654 assert(canHandle(I) && "Unknown instruction!"); 3655 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3656 I->value_op_end())); 3657 } 3658 3659 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3660 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3661 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3662 return LHS == RHS; 3663 return LHS->isIdenticalTo(RHS); 3664 } 3665 }; 3666 3667 } // end anonymous namespace 3668 3669 ///Perform cse of induction variable instructions. 3670 static void cse(BasicBlock *BB) { 3671 // Perform simple cse. 3672 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3673 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3674 Instruction *In = &*I++; 3675 3676 if (!CSEDenseMapInfo::canHandle(In)) 3677 continue; 3678 3679 // Check if we can replace this instruction with any of the 3680 // visited instructions. 3681 if (Instruction *V = CSEMap.lookup(In)) { 3682 In->replaceAllUsesWith(V); 3683 In->eraseFromParent(); 3684 continue; 3685 } 3686 3687 CSEMap[In] = In; 3688 } 3689 } 3690 3691 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3692 ElementCount VF, 3693 bool &NeedToScalarize) { 3694 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3695 Function *F = CI->getCalledFunction(); 3696 Type *ScalarRetTy = CI->getType(); 3697 SmallVector<Type *, 4> Tys, ScalarTys; 3698 for (auto &ArgOp : CI->arg_operands()) 3699 ScalarTys.push_back(ArgOp->getType()); 3700 3701 // Estimate cost of scalarized vector call. The source operands are assumed 3702 // to be vectors, so we need to extract individual elements from there, 3703 // execute VF scalar calls, and then gather the result into the vector return 3704 // value. 3705 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3706 TTI::TCK_RecipThroughput); 3707 if (VF.isScalar()) 3708 return ScalarCallCost; 3709 3710 // Compute corresponding vector type for return value and arguments. 
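// A rough numeric sketch of the comparison below (illustrative numbers only):
// with VF = 4, a scalar call cost of 10 and a scalarization overhead of 6, the
// scalarized estimate is 4 * 10 + 6 = 46; if the target provides a vector
// variant whose call cost is, say, 20, that cost is returned instead and
// NeedToScalarize is cleared.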
3711 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3712 for (Type *ScalarTy : ScalarTys) 3713 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3714 3715 // Compute costs of unpacking argument values for the scalar calls and 3716 // packing the return values to a vector. 3717 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3718 3719 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3720 3721 // If we can't emit a vector call for this function, then the currently found 3722 // cost is the cost we need to return. 3723 NeedToScalarize = true; 3724 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3725 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3726 3727 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3728 return Cost; 3729 3730 // If the corresponding vector cost is cheaper, return its cost. 3731 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3732 TTI::TCK_RecipThroughput); 3733 if (VectorCallCost < Cost) { 3734 NeedToScalarize = false; 3735 return VectorCallCost; 3736 } 3737 return Cost; 3738 } 3739 3740 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3741 ElementCount VF) { 3742 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3743 assert(ID && "Expected intrinsic call!"); 3744 3745 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3746 return TTI.getIntrinsicInstrCost(CostAttrs, 3747 TargetTransformInfo::TCK_RecipThroughput); 3748 } 3749 3750 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3751 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3752 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3753 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3754 } 3755 3756 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3757 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3758 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3759 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3760 } 3761 3762 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3763 // For every instruction `I` in MinBWs, truncate the operands, create a 3764 // truncated version of `I` and reextend its result. InstCombine runs 3765 // later and will remove any ext/trunc pairs. 3766 SmallPtrSet<Value *, 4> Erased; 3767 for (const auto &KV : Cost->getMinimalBitwidths()) { 3768 // If the value wasn't vectorized, we must maintain the original scalar 3769 // type. The absence of the value from VectorLoopValueMap indicates that it 3770 // wasn't vectorized. 3771 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3772 continue; 3773 for (unsigned Part = 0; Part < UF; ++Part) { 3774 Value *I = getOrCreateVectorValue(KV.first, Part); 3775 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3776 continue; 3777 Type *OriginalTy = I->getType(); 3778 Type *ScalarTruncatedTy = 3779 IntegerType::get(OriginalTy->getContext(), KV.second); 3780 auto *TruncatedTy = FixedVectorType::get( 3781 ScalarTruncatedTy, 3782 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3783 if (TruncatedTy == OriginalTy) 3784 continue; 3785 3786 IRBuilder<> B(cast<Instruction>(I)); 3787 auto ShrinkOperand = [&](Value *V) -> Value * { 3788 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3789 if (ZI->getSrcTy() == TruncatedTy) 3790 return ZI->getOperand(0); 3791 return B.CreateZExtOrTrunc(V, TruncatedTy); 3792 }; 3793 3794 // The actual instruction modification depends on the instruction type, 3795 // unfortunately. 
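// As an illustration of the shrink-and-extend pattern (shorthand IR, assuming
// VF = 4 and an i32 add known to need only 8 bits):
//   %a.tr = trunc <4 x i32> %a to <4 x i8>
//   %b.tr = trunc <4 x i32> %b to <4 x i8>
//   %add  = add <4 x i8> %a.tr, %b.tr
//   %res  = zext <4 x i8> %add to <4 x i32>
// InstCombine later removes any redundant trunc/zext pairs.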
3796 Value *NewI = nullptr; 3797 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3798 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3799 ShrinkOperand(BO->getOperand(1))); 3800 3801 // Any wrapping introduced by shrinking this operation shouldn't be 3802 // considered undefined behavior. So, we can't unconditionally copy 3803 // arithmetic wrapping flags to NewI. 3804 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3805 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3806 NewI = 3807 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3808 ShrinkOperand(CI->getOperand(1))); 3809 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3810 NewI = B.CreateSelect(SI->getCondition(), 3811 ShrinkOperand(SI->getTrueValue()), 3812 ShrinkOperand(SI->getFalseValue())); 3813 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3814 switch (CI->getOpcode()) { 3815 default: 3816 llvm_unreachable("Unhandled cast!"); 3817 case Instruction::Trunc: 3818 NewI = ShrinkOperand(CI->getOperand(0)); 3819 break; 3820 case Instruction::SExt: 3821 NewI = B.CreateSExtOrTrunc( 3822 CI->getOperand(0), 3823 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3824 break; 3825 case Instruction::ZExt: 3826 NewI = B.CreateZExtOrTrunc( 3827 CI->getOperand(0), 3828 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3829 break; 3830 } 3831 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3832 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3833 ->getNumElements(); 3834 auto *O0 = B.CreateZExtOrTrunc( 3835 SI->getOperand(0), 3836 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3837 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3838 ->getNumElements(); 3839 auto *O1 = B.CreateZExtOrTrunc( 3840 SI->getOperand(1), 3841 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3842 3843 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3844 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3845 // Don't do anything with the operands, just extend the result. 3846 continue; 3847 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3848 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3849 ->getNumElements(); 3850 auto *O0 = B.CreateZExtOrTrunc( 3851 IE->getOperand(0), 3852 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3853 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3854 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3855 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3856 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3857 ->getNumElements(); 3858 auto *O0 = B.CreateZExtOrTrunc( 3859 EE->getOperand(0), 3860 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3861 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3862 } else { 3863 // If we don't know what to do, be conservative and don't do anything. 3864 continue; 3865 } 3866 3867 // Lastly, extend the result. 3868 NewI->takeName(cast<Instruction>(I)); 3869 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3870 I->replaceAllUsesWith(Res); 3871 cast<Instruction>(I)->eraseFromParent(); 3872 Erased.insert(I); 3873 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3874 } 3875 } 3876 3877 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3878 for (const auto &KV : Cost->getMinimalBitwidths()) { 3879 // If the value wasn't vectorized, we must maintain the original scalar 3880 // type. 
The absence of the value from VectorLoopValueMap indicates that it
3881 // wasn't vectorized.
3882 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3883 continue;
3884 for (unsigned Part = 0; Part < UF; ++Part) {
3885 Value *I = getOrCreateVectorValue(KV.first, Part);
3886 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3887 if (Inst && Inst->use_empty()) {
3888 Value *NewI = Inst->getOperand(0);
3889 Inst->eraseFromParent();
3890 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3891 }
3892 }
3893 }
3894 }
3895
3896 void InnerLoopVectorizer::fixVectorizedLoop() {
3897 // Insert truncates and extends for any truncated instructions as hints to
3898 // InstCombine.
3899 if (VF.isVector())
3900 truncateToMinimalBitwidths();
3901
3902 // Fix widened non-induction PHIs by setting up the PHI operands.
3903 if (OrigPHIsToFix.size()) {
3904 assert(EnableVPlanNativePath &&
3905 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3906 fixNonInductionPHIs();
3907 }
3908
3909 // At this point every instruction in the original loop is widened to a
3910 // vector form. Now we need to fix the recurrences in the loop. These PHI
3911 // nodes are currently empty because we did not want to introduce cycles.
3912 // This is the second stage of vectorizing recurrences.
3913 fixCrossIterationPHIs();
3914
3915 // Forget the original basic block.
3916 PSE.getSE()->forgetLoop(OrigLoop);
3917
3918 // Fix-up external users of the induction variables.
3919 for (auto &Entry : Legal->getInductionVars())
3920 fixupIVUsers(Entry.first, Entry.second,
3921 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3922 IVEndValues[Entry.first], LoopMiddleBlock);
3923
3924 fixLCSSAPHIs();
3925 for (Instruction *PI : PredicatedInstructions)
3926 sinkScalarOperands(&*PI);
3927
3928 // Remove redundant induction instructions.
3929 cse(LoopVectorBody);
3930
3931 // Set/update profile weights for the vector and remainder loops as original
3932 // loop iterations are now distributed among them. Note that the original loop,
3933 // represented by LoopScalarBody, becomes the remainder loop after vectorization.
3934 //
3935 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3936 // end up with a slightly roughened result, but that should be OK since
3937 // profile is not inherently precise anyway. Note also that a possible bypass of
3938 // the vector code caused by legality checks is ignored, assigning all the weight
3939 // to the vector loop, optimistically.
3940 //
3941 // For scalable vectorization we can't know at compile time how many iterations
3942 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3943 // vscale of '1'.
3944 setProfileInfoAfterUnrolling(
3945 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3946 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3947 }
3948
3949 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3950 // In order to support recurrences we need to be able to vectorize Phi nodes.
3951 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3952 // stage #2: We now need to fix the recurrences by adding incoming edges to
3953 // the currently empty PHI nodes. At this point every instruction in the
3954 // original loop is widened to a vector form so we can use them to construct
3955 // the incoming edges.
3956 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3957 // Handle first-order recurrences and reductions that need to be fixed.
3958 if (Legal->isFirstOrderRecurrence(&Phi)) 3959 fixFirstOrderRecurrence(&Phi); 3960 else if (Legal->isReductionVariable(&Phi)) 3961 fixReduction(&Phi); 3962 } 3963 } 3964 3965 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3966 // This is the second phase of vectorizing first-order recurrences. An 3967 // overview of the transformation is described below. Suppose we have the 3968 // following loop. 3969 // 3970 // for (int i = 0; i < n; ++i) 3971 // b[i] = a[i] - a[i - 1]; 3972 // 3973 // There is a first-order recurrence on "a". For this loop, the shorthand 3974 // scalar IR looks like: 3975 // 3976 // scalar.ph: 3977 // s_init = a[-1] 3978 // br scalar.body 3979 // 3980 // scalar.body: 3981 // i = phi [0, scalar.ph], [i+1, scalar.body] 3982 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3983 // s2 = a[i] 3984 // b[i] = s2 - s1 3985 // br cond, scalar.body, ... 3986 // 3987 // In this example, s1 is a recurrence because it's value depends on the 3988 // previous iteration. In the first phase of vectorization, we created a 3989 // temporary value for s1. We now complete the vectorization and produce the 3990 // shorthand vector IR shown below (for VF = 4, UF = 1). 3991 // 3992 // vector.ph: 3993 // v_init = vector(..., ..., ..., a[-1]) 3994 // br vector.body 3995 // 3996 // vector.body 3997 // i = phi [0, vector.ph], [i+4, vector.body] 3998 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3999 // v2 = a[i, i+1, i+2, i+3]; 4000 // v3 = vector(v1(3), v2(0, 1, 2)) 4001 // b[i, i+1, i+2, i+3] = v2 - v3 4002 // br cond, vector.body, middle.block 4003 // 4004 // middle.block: 4005 // x = v2(3) 4006 // br scalar.ph 4007 // 4008 // scalar.ph: 4009 // s_init = phi [x, middle.block], [a[-1], otherwise] 4010 // br scalar.body 4011 // 4012 // After execution completes the vector loop, we extract the next value of 4013 // the recurrence (x) to use as the initial value in the scalar loop. 4014 4015 // Get the original loop preheader and single loop latch. 4016 auto *Preheader = OrigLoop->getLoopPreheader(); 4017 auto *Latch = OrigLoop->getLoopLatch(); 4018 4019 // Get the initial and previous values of the scalar recurrence. 4020 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4021 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4022 4023 // Create a vector from the initial value. 4024 auto *VectorInit = ScalarInit; 4025 if (VF.isVector()) { 4026 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4027 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4028 VectorInit = Builder.CreateInsertElement( 4029 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4030 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4031 } 4032 4033 // We constructed a temporary phi node in the first phase of vectorization. 4034 // This phi node will eventually be deleted. 4035 Builder.SetInsertPoint( 4036 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 4037 4038 // Create a phi node for the new recurrence. The current value will either be 4039 // the initial value inserted into a vector or loop-varying vector value. 4040 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4041 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4042 4043 // Get the vectorized previous value of the last part UF - 1. It appears last 4044 // among all unrolled iterations, due to the order of their construction. 
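  // Illustrative sketch, comment only (assuming UF = 2, names hypothetical):
  // the parts of the widened `Previous` are emitted in construction order,
  //   %prev.0 = ...   ; part 0
  //   %prev.1 = ...   ; part 1, i.e. part UF - 1 == PreviousLastPart
  // so requesting part UF - 1 below yields the textually last definition,
  // after which the recurrence shuffles can safely be inserted.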
4045 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 4046 4047 // Find and set the insertion point after the previous value if it is an 4048 // instruction. 4049 BasicBlock::iterator InsertPt; 4050 // Note that the previous value may have been constant-folded so it is not 4051 // guaranteed to be an instruction in the vector loop. 4052 // FIXME: Loop invariant values do not form recurrences. We should deal with 4053 // them earlier. 4054 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4055 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4056 else { 4057 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4058 if (isa<PHINode>(PreviousLastPart)) 4059 // If the previous value is a phi node, we should insert after all the phi 4060 // nodes in the block containing the PHI to avoid breaking basic block 4061 // verification. Note that the basic block may be different to 4062 // LoopVectorBody, in case we predicate the loop. 4063 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4064 else 4065 InsertPt = ++PreviousInst->getIterator(); 4066 } 4067 Builder.SetInsertPoint(&*InsertPt); 4068 4069 // We will construct a vector for the recurrence by combining the values for 4070 // the current and previous iterations. This is the required shuffle mask. 4071 assert(!VF.isScalable()); 4072 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4073 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4074 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4075 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4076 4077 // The vector from which to take the initial value for the current iteration 4078 // (actual or unrolled). Initially, this is the vector phi node. 4079 Value *Incoming = VecPhi; 4080 4081 // Shuffle the current and previous vector and update the vector parts. 4082 for (unsigned Part = 0; Part < UF; ++Part) { 4083 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 4084 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 4085 auto *Shuffle = 4086 VF.isVector() 4087 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4088 : Incoming; 4089 PhiPart->replaceAllUsesWith(Shuffle); 4090 cast<Instruction>(PhiPart)->eraseFromParent(); 4091 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 4092 Incoming = PreviousPart; 4093 } 4094 4095 // Fix the latch value of the new recurrence in the vector loop. 4096 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4097 4098 // Extract the last vector element in the middle block. This will be the 4099 // initial value for the recurrence when jumping to the scalar loop. 4100 auto *ExtractForScalar = Incoming; 4101 if (VF.isVector()) { 4102 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4103 ExtractForScalar = Builder.CreateExtractElement( 4104 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4105 "vector.recur.extract"); 4106 } 4107 // Extract the second last element in the middle block if the 4108 // Phi is used outside the loop. We need to extract the phi itself 4109 // and not the last element (the phi update in the current iteration). This 4110 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4111 // when the scalar loop is not run at all. 
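  // Worked example, comment only (assuming VF = 4, with the last part of the
  // recurrence source holding lanes <s1, s2, s3, s4>): lane 3 (s4), extracted
  // above, is the "next" value and seeds the scalar loop, while lane 2 (s3),
  // extracted just below, is the value the recurrence phi itself held in the
  // final vector iteration, which is what any user of the phi outside the
  // loop must observe.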
4112 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4113 if (VF.isVector()) 4114 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4115 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), 4116 "vector.recur.extract.for.phi"); 4117 // When the loop is unrolled without vectorizing, initialize 4118 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value of 4119 // `Incoming`. This is analogous to the vectorized case above: extracting the 4120 // second last element when VF > 1. 4121 else if (UF > 1) 4122 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 4123 4124 // Fix the initial value of the original recurrence in the scalar loop. 4125 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4126 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4127 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4128 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4129 Start->addIncoming(Incoming, BB); 4130 } 4131 4132 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4133 Phi->setName("scalar.recur"); 4134 4135 // Finally, fix users of the recurrence outside the loop. The users will need 4136 // either the last value of the scalar recurrence or the last value of the 4137 // vector recurrence we extracted in the middle block. Since the loop is in 4138 // LCSSA form, we just need to find all the phi nodes for the original scalar 4139 // recurrence in the exit block, and then add an edge for the middle block. 4140 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4141 if (LCSSAPhi.getIncomingValue(0) == Phi) { 4142 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4143 } 4144 } 4145 } 4146 4147 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 4148 Constant *Zero = Builder.getInt32(0); 4149 4150 // Get its reduction variable descriptor. 4151 assert(Legal->isReductionVariable(Phi) && 4152 "Unable to find the reduction variable"); 4153 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4154 4155 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 4156 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4157 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4158 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 4159 RdxDesc.getMinMaxRecurrenceKind(); 4160 setDebugLocFromInst(Builder, ReductionStartValue); 4161 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 4162 4163 // We need to generate a reduction vector from the incoming scalar. 4164 // To do so, we need to generate the 'identity' vector and override 4165 // one of the elements with the incoming scalar reduction. We need 4166 // to do it in the vector-loop preheader. 4167 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4168 4169 // This is the vector-clone of the value that leaves the loop. 4170 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 4171 4172 // Find the reduction identity value: zero for addition, or, and xor; 4173 // one for multiplication; -1 for and. 4174 Value *Identity; 4175 Value *VectorStart; 4176 if (RK == RecurrenceDescriptor::RK_IntegerMinMax || 4177 RK == RecurrenceDescriptor::RK_FloatMinMax) { 4178 // MinMax reductions have the start value as their identity.
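    // Illustrative sketch, comment only: for a min/max reduction with VF = 4
    // and start value %s, the splat below produces <%s, %s, %s, %s>; for an
    // integer add reduction the else branch instead builds
    // Identity = <0, 0, 0, 0> and VectorStart = <%s, 0, 0, 0>, so combining
    // all lanes still contributes the start value exactly once.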
4179 if (VF.isScalar() || IsInLoopReductionPhi) { 4180 VectorStart = Identity = ReductionStartValue; 4181 } else { 4182 VectorStart = Identity = 4183 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 4184 } 4185 } else { 4186 // Handle other reduction kinds: 4187 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 4188 RK, MinMaxKind, VecTy->getScalarType()); 4189 if (VF.isScalar() || IsInLoopReductionPhi) { 4190 Identity = Iden; 4191 // This vector is the Identity vector where the first element is the 4192 // incoming scalar reduction. 4193 VectorStart = ReductionStartValue; 4194 } else { 4195 Identity = ConstantVector::getSplat(VF, Iden); 4196 4197 // This vector is the Identity vector where the first element is the 4198 // incoming scalar reduction. 4199 VectorStart = 4200 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 4201 } 4202 } 4203 4204 // Wrap flags are in general invalid after vectorization, clear them. 4205 clearReductionWrapFlags(RdxDesc); 4206 4207 // Fix the vector-loop phi. 4208 4209 // Reductions do not have to start at zero. They can start with 4210 // any loop invariant values. 4211 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4212 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4213 4214 for (unsigned Part = 0; Part < UF; ++Part) { 4215 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 4216 Value *Val = getOrCreateVectorValue(LoopVal, Part); 4217 // Make sure to add the reduction start value only to the 4218 // first unroll part. 4219 Value *StartVal = (Part == 0) ? VectorStart : Identity; 4220 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 4221 cast<PHINode>(VecRdxPhi) 4222 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4223 } 4224 4225 // Before each round, move the insertion point right between 4226 // the PHIs and the values we are going to write. 4227 // This allows us to write both PHINodes and the extractelement 4228 // instructions. 4229 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4230 4231 setDebugLocFromInst(Builder, LoopExitInst); 4232 4233 // If tail is folded by masking, the vector value to leave the loop should be 4234 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4235 // instead of the former. For an inloop reduction the reduction will already 4236 // be predicated, and does not need to be handled here. 4237 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4238 for (unsigned Part = 0; Part < UF; ++Part) { 4239 Value *VecLoopExitInst = 4240 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4241 Value *Sel = nullptr; 4242 for (User *U : VecLoopExitInst->users()) { 4243 if (isa<SelectInst>(U)) { 4244 assert(!Sel && "Reduction exit feeding two selects"); 4245 Sel = U; 4246 } else 4247 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4248 } 4249 assert(Sel && "Reduction exit feeds no select"); 4250 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4251 4252 // If the target can create a predicated operator for the reduction at no 4253 // extra cost in the loop (for example a predicated vadd), it can be 4254 // cheaper for the select to remain in the loop than be sunk out of it, 4255 // and so use the select value for the phi instead of the old 4256 // LoopExitValue. 
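      // Rough sketch, comment only (names hypothetical): with tail folding
      // the vector loop contains
      //   %red = phi [ %start, %ph ], [ %backedge, %latch ]
      //   %add = add %red, %x
      //   %sel = select %mask, %add, %red
      // and the code below chooses whether %backedge is %add (so %sel is only
      // needed by the middle block and can be sunk out of the loop) or %sel
      // (keeping the predicated form in the loop when the target prefers it).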
4257 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4258 if (PreferPredicatedReductionSelect || 4259 TTI->preferPredicatedReductionSelect( 4260 RdxDesc.getRecurrenceBinOp(), Phi->getType(), 4261 TargetTransformInfo::ReductionFlags())) { 4262 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4263 VecRdxPhi->setIncomingValueForBlock( 4264 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4265 } 4266 } 4267 } 4268 4269 // If the vector reduction can be performed in a smaller type, we truncate 4270 // then extend the loop exit value to enable InstCombine to evaluate the 4271 // entire expression in the smaller type. 4272 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4273 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4274 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4275 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4276 Builder.SetInsertPoint( 4277 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4278 VectorParts RdxParts(UF); 4279 for (unsigned Part = 0; Part < UF; ++Part) { 4280 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4281 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4282 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4283 : Builder.CreateZExt(Trunc, VecTy); 4284 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4285 UI != RdxParts[Part]->user_end();) 4286 if (*UI != Trunc) { 4287 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4288 RdxParts[Part] = Extnd; 4289 } else { 4290 ++UI; 4291 } 4292 } 4293 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4294 for (unsigned Part = 0; Part < UF; ++Part) { 4295 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4296 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4297 } 4298 } 4299 4300 // Reduce all of the unrolled parts into a single vector. 4301 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4302 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 4303 4304 // The middle block terminator has already been assigned a DebugLoc here (the 4305 // OrigLoop's single latch terminator). We want the whole middle block to 4306 // appear to execute on this line because: (a) it is all compiler generated, 4307 // (b) these instructions are always executed after evaluating the latch 4308 // conditional branch, and (c) other passes may add new predecessors which 4309 // terminate on this line. This is the easiest way to ensure we don't 4310 // accidentally cause an extra step back into the loop while debugging. 4311 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4312 for (unsigned Part = 1; Part < UF; ++Part) { 4313 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4314 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4315 // Floating point operations had to be 'fast' to enable the reduction. 4316 ReducedPartRdx = addFastMathFlag( 4317 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4318 ReducedPartRdx, "bin.rdx"), 4319 RdxDesc.getFastMathFlags()); 4320 else 4321 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 4322 RdxPart); 4323 } 4324 4325 // Create the reduction after the loop. Note that inloop reductions create the 4326 // target reduction in the loop using a Reduction recipe. 
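  // Worked example, comment only (assuming an add reduction with UF = 2 and
  // VF = 4, value names illustrative): the loop above first combines the
  // unrolled parts,
  //   %bin.rdx = add <4 x i32> %rdx.part.1, %rdx.part.0
  // and createTargetReduction below then collapses %bin.rdx into a single
  // scalar via a horizontal reduction, which is sign- or zero-extended back
  // to the phi type further down if the reduction was narrowed.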
4327 if (VF.isVector() && !IsInLoopReductionPhi) { 4328 bool NoNaN = Legal->hasFunNoNaNAttr(); 4329 ReducedPartRdx = 4330 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 4331 // If the reduction can be performed in a smaller type, we need to extend 4332 // the reduction to the wider type before we branch to the original loop. 4333 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4334 ReducedPartRdx = 4335 RdxDesc.isSigned() 4336 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4337 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4338 } 4339 4340 // Create a phi node that merges control-flow from the backedge-taken check 4341 // block and the middle block. 4342 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4343 LoopScalarPreHeader->getTerminator()); 4344 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4345 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4346 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4347 4348 // Now, we need to fix the users of the reduction variable 4349 // inside and outside of the scalar remainder loop. 4350 // We know that the loop is in LCSSA form. We need to update the 4351 // PHI nodes in the exit blocks. 4352 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4353 // All PHINodes need to have a single entry edge, or two if 4354 // we already fixed them. 4355 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4356 4357 // We found a reduction value exit-PHI. Update it with the 4358 // incoming bypass edge. 4359 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4360 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4361 } // end of the LCSSA phi scan. 4362 4363 // Fix the scalar loop reduction variable with the incoming reduction sum 4364 // from the vector body and from the backedge value. 4365 int IncomingEdgeBlockIdx = 4366 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4367 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4368 // Pick the other block. 4369 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4370 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4371 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4372 } 4373 4374 void InnerLoopVectorizer::clearReductionWrapFlags( 4375 RecurrenceDescriptor &RdxDesc) { 4376 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 4377 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 4378 RK != RecurrenceDescriptor::RK_IntegerMult) 4379 return; 4380 4381 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4382 assert(LoopExitInstr && "null loop exit instruction"); 4383 SmallVector<Instruction *, 8> Worklist; 4384 SmallPtrSet<Instruction *, 8> Visited; 4385 Worklist.push_back(LoopExitInstr); 4386 Visited.insert(LoopExitInstr); 4387 4388 while (!Worklist.empty()) { 4389 Instruction *Cur = Worklist.pop_back_val(); 4390 if (isa<OverflowingBinaryOperator>(Cur)) 4391 for (unsigned Part = 0; Part < UF; ++Part) { 4392 Value *V = getOrCreateVectorValue(Cur, Part); 4393 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4394 } 4395 4396 for (User *U : Cur->users()) { 4397 Instruction *UI = cast<Instruction>(U); 4398 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4399 Visited.insert(UI).second) 4400 Worklist.push_back(UI); 4401 } 4402 } 4403 } 4404 4405 void InnerLoopVectorizer::fixLCSSAPHIs() { 4406 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4407 if (LCSSAPhi.getNumIncomingValues() == 1) { 4408 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4409 // Non-instruction incoming values will have only one value. 4410 unsigned LastLane = 0; 4411 if (isa<Instruction>(IncomingValue)) 4412 LastLane = Cost->isUniformAfterVectorization( 4413 cast<Instruction>(IncomingValue), VF) 4414 ? 0 4415 : VF.getKnownMinValue() - 1; 4416 assert((!VF.isScalable() || LastLane == 0) && 4417 "scalable vectors dont support non-uniform scalars yet"); 4418 // Can be a loop invariant incoming value or the last scalar value to be 4419 // extracted from the vectorized loop. 4420 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4421 Value *lastIncomingValue = 4422 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4423 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4424 } 4425 } 4426 } 4427 4428 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4429 // The basic block and loop containing the predicated instruction. 4430 auto *PredBB = PredInst->getParent(); 4431 auto *VectorLoop = LI->getLoopFor(PredBB); 4432 4433 // Initialize a worklist with the operands of the predicated instruction. 4434 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4435 4436 // Holds instructions that we need to analyze again. An instruction may be 4437 // reanalyzed if we don't yet know if we can sink it or not. 4438 SmallVector<Instruction *, 8> InstsToReanalyze; 4439 4440 // Returns true if a given use occurs in the predicated block. Phi nodes use 4441 // their operands in their corresponding predecessor blocks. 4442 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4443 auto *I = cast<Instruction>(U.getUser()); 4444 BasicBlock *BB = I->getParent(); 4445 if (auto *Phi = dyn_cast<PHINode>(I)) 4446 BB = Phi->getIncomingBlock( 4447 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4448 return BB == PredBB; 4449 }; 4450 4451 // Iteratively sink the scalarized operands of the predicated instruction 4452 // into the block we created for it. When an instruction is sunk, it's 4453 // operands are then added to the worklist. 
The algorithm ends after one pass 4454 // through the worklist doesn't sink a single instruction. 4455 bool Changed; 4456 do { 4457 // Add the instructions that need to be reanalyzed to the worklist, and 4458 // reset the changed indicator. 4459 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4460 InstsToReanalyze.clear(); 4461 Changed = false; 4462 4463 while (!Worklist.empty()) { 4464 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4465 4466 // We can't sink an instruction if it is a phi node, is already in the 4467 // predicated block, is not in the loop, or may have side effects. 4468 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4469 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4470 continue; 4471 4472 // It's legal to sink the instruction if all its uses occur in the 4473 // predicated block. Otherwise, there's nothing to do yet, and we may 4474 // need to reanalyze the instruction. 4475 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4476 InstsToReanalyze.push_back(I); 4477 continue; 4478 } 4479 4480 // Move the instruction to the beginning of the predicated block, and add 4481 // it's operands to the worklist. 4482 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4483 Worklist.insert(I->op_begin(), I->op_end()); 4484 4485 // The sinking may have enabled other instructions to be sunk, so we will 4486 // need to iterate. 4487 Changed = true; 4488 } 4489 } while (Changed); 4490 } 4491 4492 void InnerLoopVectorizer::fixNonInductionPHIs() { 4493 for (PHINode *OrigPhi : OrigPHIsToFix) { 4494 PHINode *NewPhi = 4495 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4496 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4497 4498 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4499 predecessors(OrigPhi->getParent())); 4500 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4501 predecessors(NewPhi->getParent())); 4502 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4503 "Scalar and Vector BB should have the same number of predecessors"); 4504 4505 // The insertion point in Builder may be invalidated by the time we get 4506 // here. Force the Builder insertion point to something valid so that we do 4507 // not run into issues during insertion point restore in 4508 // getOrCreateVectorValue calls below. 4509 Builder.SetInsertPoint(NewPhi); 4510 4511 // The predecessor order is preserved and we can rely on mapping between 4512 // scalar and vector block predecessors. 4513 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4514 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4515 4516 // When looking up the new scalar/vector values to fix up, use incoming 4517 // values from original phi. 4518 Value *ScIncV = 4519 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4520 4521 // Scalar incoming value may need a broadcast 4522 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4523 NewPhi->addIncoming(NewIncV, NewPredBB); 4524 } 4525 } 4526 } 4527 4528 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4529 VPUser &Operands, unsigned UF, 4530 ElementCount VF, bool IsPtrLoopInvariant, 4531 SmallBitVector &IsIndexLoopInvariant, 4532 VPTransformState &State) { 4533 // Construct a vector GEP by widening the operands of the scalar GEP as 4534 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4535 // results in a vector of pointers when at least one operand of the GEP 4536 // is vector-typed. 
Thus, to keep the representation compact, we only use 4537 // vector-typed operands for loop-varying values. 4538 4539 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4540 // If we are vectorizing, but the GEP has only loop-invariant operands, 4541 // the GEP we build (by only using vector-typed operands for 4542 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4543 // produce a vector of pointers, we need to either arbitrarily pick an 4544 // operand to broadcast, or broadcast a clone of the original GEP. 4545 // Here, we broadcast a clone of the original. 4546 // 4547 // TODO: If at some point we decide to scalarize instructions having 4548 // loop-invariant operands, this special case will no longer be 4549 // required. We would add the scalarization decision to 4550 // collectLoopScalars() and teach getVectorValue() to broadcast 4551 // the lane-zero scalar value. 4552 auto *Clone = Builder.Insert(GEP->clone()); 4553 for (unsigned Part = 0; Part < UF; ++Part) { 4554 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4555 State.set(VPDef, GEP, EntryPart, Part); 4556 addMetadata(EntryPart, GEP); 4557 } 4558 } else { 4559 // If the GEP has at least one loop-varying operand, we are sure to 4560 // produce a vector of pointers. But if we are only unrolling, we want 4561 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4562 // produce with the code below will be scalar (if VF == 1) or vector 4563 // (otherwise). Note that for the unroll-only case, we still maintain 4564 // values in the vector mapping with initVector, as we do for other 4565 // instructions. 4566 for (unsigned Part = 0; Part < UF; ++Part) { 4567 // The pointer operand of the new GEP. If it's loop-invariant, we 4568 // won't broadcast it. 4569 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4570 : State.get(Operands.getOperand(0), Part); 4571 4572 // Collect all the indices for the new GEP. If any index is 4573 // loop-invariant, we won't broadcast it. 4574 SmallVector<Value *, 4> Indices; 4575 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4576 VPValue *Operand = Operands.getOperand(I); 4577 if (IsIndexLoopInvariant[I - 1]) 4578 Indices.push_back(State.get(Operand, {0, 0})); 4579 else 4580 Indices.push_back(State.get(Operand, Part)); 4581 } 4582 4583 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4584 // but it should be a vector, otherwise. 4585 auto *NewGEP = 4586 GEP->isInBounds() 4587 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4588 Indices) 4589 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4590 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4591 "NewGEP is not a pointer vector"); 4592 State.set(VPDef, GEP, NewGEP, Part); 4593 addMetadata(NewGEP, GEP); 4594 } 4595 } 4596 } 4597 4598 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4599 ElementCount VF) { 4600 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4601 PHINode *P = cast<PHINode>(PN); 4602 if (EnableVPlanNativePath) { 4603 // Currently we enter here in the VPlan-native path for non-induction 4604 // PHIs where all control flow is uniform. We simply widen these PHIs. 4605 // Create a vector phi with no operands - the vector phi operands will be 4606 // set at the end of vector code generation. 4607 Type *VecTy = 4608 (VF.isScalar()) ? 
PN->getType() : VectorType::get(PN->getType(), VF); 4609 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4610 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4611 OrigPHIsToFix.push_back(P); 4612 4613 return; 4614 } 4615 4616 assert(PN->getParent() == OrigLoop->getHeader() && 4617 "Non-header phis should have been handled elsewhere"); 4618 4619 // In order to support recurrences we need to be able to vectorize Phi nodes. 4620 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4621 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4622 // this value when we vectorize all of the instructions that use the PHI. 4623 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4624 for (unsigned Part = 0; Part < UF; ++Part) { 4625 // This is phase one of vectorizing PHIs. 4626 bool ScalarPHI = 4627 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4628 Type *VecTy = 4629 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4630 Value *EntryPart = PHINode::Create( 4631 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4632 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4633 } 4634 return; 4635 } 4636 4637 setDebugLocFromInst(Builder, P); 4638 4639 // This PHINode must be an induction variable. 4640 // Make sure that we know about it. 4641 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4642 4643 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4644 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4645 4646 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4647 // which can be found from the original scalar operations. 4648 switch (II.getKind()) { 4649 case InductionDescriptor::IK_NoInduction: 4650 llvm_unreachable("Unknown induction"); 4651 case InductionDescriptor::IK_IntInduction: 4652 case InductionDescriptor::IK_FpInduction: 4653 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4654 case InductionDescriptor::IK_PtrInduction: { 4655 // Handle the pointer induction variable case. 4656 assert(P->getType()->isPointerTy() && "Unexpected type."); 4657 4658 if (Cost->isScalarAfterVectorization(P, VF)) { 4659 // This is the normalized GEP that starts counting at zero. 4660 Value *PtrInd = 4661 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4662 // Determine the number of scalars we need to generate for each unroll 4663 // iteration. If the instruction is uniform, we only need to generate the 4664 // first lane. Otherwise, we generate all VF values. 4665 unsigned Lanes = 4666 Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.getKnownMinValue(); 4667 for (unsigned Part = 0; Part < UF; ++Part) { 4668 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4669 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4670 Lane + Part * VF.getKnownMinValue()); 4671 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4672 Value *SclrGep = 4673 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4674 SclrGep->setName("next.gep"); 4675 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4676 } 4677 } 4678 return; 4679 } 4680 assert(isa<SCEVConstant>(II.getStep()) && 4681 "Induction step not a SCEV constant!"); 4682 Type *PhiType = II.getStep()->getType(); 4683 4684 // Build a pointer phi 4685 Value *ScalarStartValue = II.getStartValue(); 4686 Type *ScStValueType = ScalarStartValue->getType(); 4687 PHINode *NewPointerPhi = 4688 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4689 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4690 4691 // A pointer induction, performed by using a gep 4692 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4693 Instruction *InductionLoc = LoopLatch->getTerminator(); 4694 const SCEV *ScalarStep = II.getStep(); 4695 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4696 Value *ScalarStepValue = 4697 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4698 Value *InductionGEP = GetElementPtrInst::Create( 4699 ScStValueType->getPointerElementType(), NewPointerPhi, 4700 Builder.CreateMul( 4701 ScalarStepValue, 4702 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4703 "ptr.ind", InductionLoc); 4704 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4705 4706 // Create UF many actual address geps that use the pointer 4707 // phi as base and a vectorized version of the step value 4708 // (<step*0, ..., step*N>) as offset. 4709 for (unsigned Part = 0; Part < UF; ++Part) { 4710 SmallVector<Constant *, 8> Indices; 4711 // Create a vector of consecutive numbers from zero to VF. 4712 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4713 Indices.push_back( 4714 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4715 Constant *StartOffset = ConstantVector::get(Indices); 4716 4717 Value *GEP = Builder.CreateGEP( 4718 ScStValueType->getPointerElementType(), NewPointerPhi, 4719 Builder.CreateMul( 4720 StartOffset, 4721 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4722 "vector.gep")); 4723 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4724 } 4725 } 4726 } 4727 } 4728 4729 /// A helper function for checking whether an integer division-related 4730 /// instruction may divide by zero (in which case it must be predicated if 4731 /// executed conditionally in the scalar code). 4732 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4733 /// Non-zero divisors that are non compile-time constants will not be 4734 /// converted into multiplication, so we will still end up scalarizing 4735 /// the division, but can do so w/o predication. 
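/// Illustrative example, comment only: in
///   for (i = 0; i < n; ++i)
///     if (c[i] != 0)
///       a[i] = b[i] / c[i];
/// the division is executed conditionally and its divisor is not a known
/// non-zero constant, so this helper returns true and the division must be
/// predicated (or scalarized behind its mask) when the loop is vectorized.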
4736 static bool mayDivideByZero(Instruction &I) { 4737 assert((I.getOpcode() == Instruction::UDiv || 4738 I.getOpcode() == Instruction::SDiv || 4739 I.getOpcode() == Instruction::URem || 4740 I.getOpcode() == Instruction::SRem) && 4741 "Unexpected instruction"); 4742 Value *Divisor = I.getOperand(1); 4743 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4744 return !CInt || CInt->isZero(); 4745 } 4746 4747 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4748 VPUser &User, 4749 VPTransformState &State) { 4750 switch (I.getOpcode()) { 4751 case Instruction::Call: 4752 case Instruction::Br: 4753 case Instruction::PHI: 4754 case Instruction::GetElementPtr: 4755 case Instruction::Select: 4756 llvm_unreachable("This instruction is handled by a different recipe."); 4757 case Instruction::UDiv: 4758 case Instruction::SDiv: 4759 case Instruction::SRem: 4760 case Instruction::URem: 4761 case Instruction::Add: 4762 case Instruction::FAdd: 4763 case Instruction::Sub: 4764 case Instruction::FSub: 4765 case Instruction::FNeg: 4766 case Instruction::Mul: 4767 case Instruction::FMul: 4768 case Instruction::FDiv: 4769 case Instruction::FRem: 4770 case Instruction::Shl: 4771 case Instruction::LShr: 4772 case Instruction::AShr: 4773 case Instruction::And: 4774 case Instruction::Or: 4775 case Instruction::Xor: { 4776 // Just widen unops and binops. 4777 setDebugLocFromInst(Builder, &I); 4778 4779 for (unsigned Part = 0; Part < UF; ++Part) { 4780 SmallVector<Value *, 2> Ops; 4781 for (VPValue *VPOp : User.operands()) 4782 Ops.push_back(State.get(VPOp, Part)); 4783 4784 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4785 4786 if (auto *VecOp = dyn_cast<Instruction>(V)) 4787 VecOp->copyIRFlags(&I); 4788 4789 // Use this vector value for all users of the original instruction. 4790 State.set(Def, &I, V, Part); 4791 addMetadata(V, &I); 4792 } 4793 4794 break; 4795 } 4796 case Instruction::ICmp: 4797 case Instruction::FCmp: { 4798 // Widen compares. Generate vector compares. 4799 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4800 auto *Cmp = cast<CmpInst>(&I); 4801 setDebugLocFromInst(Builder, Cmp); 4802 for (unsigned Part = 0; Part < UF; ++Part) { 4803 Value *A = State.get(User.getOperand(0), Part); 4804 Value *B = State.get(User.getOperand(1), Part); 4805 Value *C = nullptr; 4806 if (FCmp) { 4807 // Propagate fast math flags. 4808 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4809 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4810 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4811 } else { 4812 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4813 } 4814 State.set(Def, &I, C, Part); 4815 addMetadata(C, &I); 4816 } 4817 4818 break; 4819 } 4820 4821 case Instruction::ZExt: 4822 case Instruction::SExt: 4823 case Instruction::FPToUI: 4824 case Instruction::FPToSI: 4825 case Instruction::FPExt: 4826 case Instruction::PtrToInt: 4827 case Instruction::IntToPtr: 4828 case Instruction::SIToFP: 4829 case Instruction::UIToFP: 4830 case Instruction::Trunc: 4831 case Instruction::FPTrunc: 4832 case Instruction::BitCast: { 4833 auto *CI = cast<CastInst>(&I); 4834 setDebugLocFromInst(Builder, CI); 4835 4836 /// Vectorize casts. 4837 Type *DestTy = 4838 (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4839 4840 for (unsigned Part = 0; Part < UF; ++Part) { 4841 Value *A = State.get(User.getOperand(0), Part); 4842 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4843 State.set(Def, &I, Cast, Part); 4844 addMetadata(Cast, &I); 4845 } 4846 break; 4847 } 4848 default: 4849 // This instruction is not vectorized by simple widening. 4850 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4851 llvm_unreachable("Unhandled instruction!"); 4852 } // end of switch. 4853 } 4854 4855 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4856 VPUser &ArgOperands, 4857 VPTransformState &State) { 4858 assert(!isa<DbgInfoIntrinsic>(I) && 4859 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4860 setDebugLocFromInst(Builder, &I); 4861 4862 Module *M = I.getParent()->getParent()->getParent(); 4863 auto *CI = cast<CallInst>(&I); 4864 4865 SmallVector<Type *, 4> Tys; 4866 for (Value *ArgOperand : CI->arg_operands()) 4867 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4868 4869 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4870 4871 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4872 // version of the instruction. 4873 // Is it beneficial to perform intrinsic call compared to lib call? 4874 bool NeedToScalarize = false; 4875 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4876 bool UseVectorIntrinsic = 4877 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4878 assert((UseVectorIntrinsic || !NeedToScalarize) && 4879 "Instruction should be scalarized elsewhere."); 4880 4881 for (unsigned Part = 0; Part < UF; ++Part) { 4882 SmallVector<Value *, 4> Args; 4883 for (auto &I : enumerate(ArgOperands.operands())) { 4884 // Some intrinsics have a scalar argument - don't replace it with a 4885 // vector. 4886 Value *Arg; 4887 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4888 Arg = State.get(I.value(), Part); 4889 else 4890 Arg = State.get(I.value(), {0, 0}); 4891 Args.push_back(Arg); 4892 } 4893 4894 Function *VectorF; 4895 if (UseVectorIntrinsic) { 4896 // Use vector version of the intrinsic. 4897 Type *TysForDecl[] = {CI->getType()}; 4898 if (VF.isVector()) { 4899 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4900 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4901 } 4902 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4903 assert(VectorF && "Can't retrieve vector intrinsic."); 4904 } else { 4905 // Use vector version of the function call. 4906 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4907 #ifndef NDEBUG 4908 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4909 "Can't create vector function."); 4910 #endif 4911 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4912 } 4913 SmallVector<OperandBundleDef, 1> OpBundles; 4914 CI->getOperandBundlesAsDefs(OpBundles); 4915 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4916 4917 if (isa<FPMathOperator>(V)) 4918 V->copyFastMathFlags(CI); 4919 4920 State.set(Def, &I, V, Part); 4921 addMetadata(V, &I); 4922 } 4923 } 4924 4925 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4926 VPUser &Operands, 4927 bool InvariantCond, 4928 VPTransformState &State) { 4929 setDebugLocFromInst(Builder, &I); 4930 4931 // The condition can be loop invariant but still defined inside the 4932 // loop. 
This means that we can't just use the original 'cond' value. 4933 // We have to take the 'vectorized' value and pick the first lane. 4934 // Instcombine will make this a no-op. 4935 auto *InvarCond = 4936 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4937 4938 for (unsigned Part = 0; Part < UF; ++Part) { 4939 Value *Cond = 4940 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4941 Value *Op0 = State.get(Operands.getOperand(1), Part); 4942 Value *Op1 = State.get(Operands.getOperand(2), Part); 4943 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4944 State.set(VPDef, &I, Sel, Part); 4945 addMetadata(Sel, &I); 4946 } 4947 } 4948 4949 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4950 // We should not collect Scalars more than once per VF. Right now, this 4951 // function is called from collectUniformsAndScalars(), which already does 4952 // this check. Collecting Scalars for VF=1 does not make any sense. 4953 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4954 "This function should not be visited twice for the same VF"); 4955 4956 SmallSetVector<Instruction *, 8> Worklist; 4957 4958 // These sets are used to seed the analysis with pointers used by memory 4959 // accesses that will remain scalar. 4960 SmallSetVector<Instruction *, 8> ScalarPtrs; 4961 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4962 auto *Latch = TheLoop->getLoopLatch(); 4963 4964 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4965 // The pointer operands of loads and stores will be scalar as long as the 4966 // memory access is not a gather or scatter operation. The value operand of a 4967 // store will remain scalar if the store is scalarized. 4968 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4969 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4970 assert(WideningDecision != CM_Unknown && 4971 "Widening decision should be ready at this moment"); 4972 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4973 if (Ptr == Store->getValueOperand()) 4974 return WideningDecision == CM_Scalarize; 4975 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4976 "Ptr is neither a value or pointer operand"); 4977 return WideningDecision != CM_GatherScatter; 4978 }; 4979 4980 // A helper that returns true if the given value is a bitcast or 4981 // getelementptr instruction contained in the loop. 4982 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4983 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4984 isa<GetElementPtrInst>(V)) && 4985 !TheLoop->isLoopInvariant(V); 4986 }; 4987 4988 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4989 if (!isa<PHINode>(Ptr) || 4990 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4991 return false; 4992 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4993 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4994 return false; 4995 return isScalarUse(MemAccess, Ptr); 4996 }; 4997 4998 // A helper that evaluates a memory access's use of a pointer. If the 4999 // pointer is actually the pointer induction of a loop, it is being 5000 // inserted into Worklist. If the use will be a scalar use, and the 5001 // pointer is only used by memory accesses, we place the pointer in 5002 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
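  // Small example, comment only: for a loop body like `a[i] = b[i]`, the GEPs
  // computing &b[i] and &a[i] are used only as addresses of widenable
  // accesses, so they end up in ScalarPtrs; a GEP whose value is also stored
  // to memory or fed to non-memory arithmetic would instead be recorded in
  // PossibleNonScalarPtrs.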
5003 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5004 if (isScalarPtrInduction(MemAccess, Ptr)) { 5005 Worklist.insert(cast<Instruction>(Ptr)); 5006 Instruction *Update = cast<Instruction>( 5007 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5008 Worklist.insert(Update); 5009 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5010 << "\n"); 5011 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5012 << "\n"); 5013 return; 5014 } 5015 // We only care about bitcast and getelementptr instructions contained in 5016 // the loop. 5017 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5018 return; 5019 5020 // If the pointer has already been identified as scalar (e.g., if it was 5021 // also identified as uniform), there's nothing to do. 5022 auto *I = cast<Instruction>(Ptr); 5023 if (Worklist.count(I)) 5024 return; 5025 5026 // If the use of the pointer will be a scalar use, and all users of the 5027 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5028 // place the pointer in PossibleNonScalarPtrs. 5029 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5030 return isa<LoadInst>(U) || isa<StoreInst>(U); 5031 })) 5032 ScalarPtrs.insert(I); 5033 else 5034 PossibleNonScalarPtrs.insert(I); 5035 }; 5036 5037 // We seed the scalars analysis with two classes of instructions: (1) 5038 // instructions marked uniform-after-vectorization and (2) bitcast, 5039 // getelementptr and (pointer) phi instructions used by memory accesses 5040 // requiring a scalar use. 5041 // 5042 // (1) Add to the worklist all instructions that have been identified as 5043 // uniform-after-vectorization. 5044 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5045 5046 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5047 // memory accesses requiring a scalar use. The pointer operands of loads and 5048 // stores will be scalar as long as the memory access is not a gather or 5049 // scatter operation. The value operand of a store will remain scalar if the 5050 // store is scalarized. 5051 for (auto *BB : TheLoop->blocks()) 5052 for (auto &I : *BB) { 5053 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5054 evaluatePtrUse(Load, Load->getPointerOperand()); 5055 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5056 evaluatePtrUse(Store, Store->getPointerOperand()); 5057 evaluatePtrUse(Store, Store->getValueOperand()); 5058 } 5059 } 5060 for (auto *I : ScalarPtrs) 5061 if (!PossibleNonScalarPtrs.count(I)) { 5062 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5063 Worklist.insert(I); 5064 } 5065 5066 // Insert the forced scalars. 5067 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5068 // induction variable when the PHI user is scalarized. 5069 auto ForcedScalar = ForcedScalars.find(VF); 5070 if (ForcedScalar != ForcedScalars.end()) 5071 for (auto *I : ForcedScalar->second) 5072 Worklist.insert(I); 5073 5074 // Expand the worklist by looking through any bitcasts and getelementptr 5075 // instructions we've already identified as scalar. This is similar to the 5076 // expansion step in collectLoopUniforms(); however, here we're only 5077 // expanding to include additional bitcasts and getelementptr instructions.
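  // Sketch of the expansion, comment only: if a GEP already known to be
  // scalar is itself fed by a bitcast or another loop-varying GEP whose users
  // are all either in the worklist or memory accesses using it as a scalar
  // address, that operand is added to the worklist as well, so entire address
  // computation chains remain scalar rather than being partially widened.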
5078 unsigned Idx = 0; 5079 while (Idx != Worklist.size()) { 5080 Instruction *Dst = Worklist[Idx++]; 5081 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5082 continue; 5083 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5084 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5085 auto *J = cast<Instruction>(U); 5086 return !TheLoop->contains(J) || Worklist.count(J) || 5087 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5088 isScalarUse(J, Src)); 5089 })) { 5090 Worklist.insert(Src); 5091 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5092 } 5093 } 5094 5095 // An induction variable will remain scalar if all users of the induction 5096 // variable and induction variable update remain scalar. 5097 for (auto &Induction : Legal->getInductionVars()) { 5098 auto *Ind = Induction.first; 5099 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5100 5101 // If tail-folding is applied, the primary induction variable will be used 5102 // to feed a vector compare. 5103 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5104 continue; 5105 5106 // Determine if all users of the induction variable are scalar after 5107 // vectorization. 5108 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5109 auto *I = cast<Instruction>(U); 5110 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5111 }); 5112 if (!ScalarInd) 5113 continue; 5114 5115 // Determine if all users of the induction variable update instruction are 5116 // scalar after vectorization. 5117 auto ScalarIndUpdate = 5118 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5119 auto *I = cast<Instruction>(U); 5120 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5121 }); 5122 if (!ScalarIndUpdate) 5123 continue; 5124 5125 // The induction variable and its update instruction will remain scalar. 5126 Worklist.insert(Ind); 5127 Worklist.insert(IndUpdate); 5128 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5129 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5130 << "\n"); 5131 } 5132 5133 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5134 } 5135 5136 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5137 ElementCount VF) { 5138 if (!blockNeedsPredication(I->getParent())) 5139 return false; 5140 switch(I->getOpcode()) { 5141 default: 5142 break; 5143 case Instruction::Load: 5144 case Instruction::Store: { 5145 if (!Legal->isMaskRequired(I)) 5146 return false; 5147 auto *Ptr = getLoadStorePointerOperand(I); 5148 auto *Ty = getMemInstValueType(I); 5149 // We have already decided how to vectorize this instruction, get that 5150 // result. 5151 if (VF.isVector()) { 5152 InstWidening WideningDecision = getWideningDecision(I, VF); 5153 assert(WideningDecision != CM_Unknown && 5154 "Widening decision should be ready at this moment"); 5155 return WideningDecision == CM_Scalarize; 5156 } 5157 const Align Alignment = getLoadStoreAlignment(I); 5158 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5159 isLegalMaskedGather(Ty, Alignment)) 5160 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5161 isLegalMaskedScatter(Ty, Alignment)); 5162 } 5163 case Instruction::UDiv: 5164 case Instruction::SDiv: 5165 case Instruction::SRem: 5166 case Instruction::URem: 5167 return mayDivideByZero(*I); 5168 } 5169 return false; 5170 } 5171 5172 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5173 Instruction *I, ElementCount VF) { 5174 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5175 assert(getWideningDecision(I, VF) == CM_Unknown && 5176 "Decision should not be set yet."); 5177 auto *Group = getInterleavedAccessGroup(I); 5178 assert(Group && "Must have a group."); 5179 5180 // If the instruction's allocated size doesn't equal it's type size, it 5181 // requires padding and will be scalarized. 5182 auto &DL = I->getModule()->getDataLayout(); 5183 auto *ScalarTy = getMemInstValueType(I); 5184 if (hasIrregularType(ScalarTy, DL, VF)) 5185 return false; 5186 5187 // Check if masking is required. 5188 // A Group may need masking for one of two reasons: it resides in a block that 5189 // needs predication, or it was decided to use masking to deal with gaps. 5190 bool PredicatedAccessRequiresMasking = 5191 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5192 bool AccessWithGapsRequiresMasking = 5193 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5194 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5195 return true; 5196 5197 // If masked interleaving is required, we expect that the user/target had 5198 // enabled it, because otherwise it either wouldn't have been created or 5199 // it should have been invalidated by the CostModel. 5200 assert(useMaskedInterleavedAccesses(TTI) && 5201 "Masked interleave-groups for predicated accesses are not enabled."); 5202 5203 auto *Ty = getMemInstValueType(I); 5204 const Align Alignment = getLoadStoreAlignment(I); 5205 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5206 : TTI.isLegalMaskedStore(Ty, Alignment); 5207 } 5208 5209 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5210 Instruction *I, ElementCount VF) { 5211 // Get and ensure we have a valid memory instruction. 5212 LoadInst *LI = dyn_cast<LoadInst>(I); 5213 StoreInst *SI = dyn_cast<StoreInst>(I); 5214 assert((LI || SI) && "Invalid memory instruction"); 5215 5216 auto *Ptr = getLoadStorePointerOperand(I); 5217 5218 // In order to be widened, the pointer should be consecutive, first of all. 5219 if (!Legal->isConsecutivePtr(Ptr)) 5220 return false; 5221 5222 // If the instruction is a store located in a predicated block, it will be 5223 // scalarized. 5224 if (isScalarWithPredication(I)) 5225 return false; 5226 5227 // If the instruction's allocated size doesn't equal it's type size, it 5228 // requires padding and will be scalarized. 5229 auto &DL = I->getModule()->getDataLayout(); 5230 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5231 if (hasIrregularType(ScalarTy, DL, VF)) 5232 return false; 5233 5234 return true; 5235 } 5236 5237 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5238 // We should not collect Uniforms more than once per VF. Right now, 5239 // this function is called from collectUniformsAndScalars(), which 5240 // already does this check. Collecting Uniforms for VF=1 does not make any 5241 // sense. 
5242 5243 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5244 "This function should not be visited twice for the same VF"); 5245 5246 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5247 // not analyze again. Uniforms.count(VF) will return 1. 5248 Uniforms[VF].clear(); 5249 5250 // We now know that the loop is vectorizable! 5251 // Collect instructions inside the loop that will remain uniform after 5252 // vectorization. 5253 5254 // Global values, params and instructions outside of current loop are out of 5255 // scope. 5256 auto isOutOfScope = [&](Value *V) -> bool { 5257 Instruction *I = dyn_cast<Instruction>(V); 5258 return (!I || !TheLoop->contains(I)); 5259 }; 5260 5261 SetVector<Instruction *> Worklist; 5262 BasicBlock *Latch = TheLoop->getLoopLatch(); 5263 5264 // Instructions that are scalar with predication must not be considered 5265 // uniform after vectorization, because that would create an erroneous 5266 // replicating region where only a single instance out of VF should be formed. 5267 // TODO: optimize such seldom cases if found important, see PR40816. 5268 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5269 if (isOutOfScope(I)) { 5270 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5271 << *I << "\n"); 5272 return; 5273 } 5274 if (isScalarWithPredication(I, VF)) { 5275 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5276 << *I << "\n"); 5277 return; 5278 } 5279 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5280 Worklist.insert(I); 5281 }; 5282 5283 // Start with the conditional branch. If the branch condition is an 5284 // instruction contained in the loop that is only used by the branch, it is 5285 // uniform. 5286 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5287 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5288 addToWorklistIfAllowed(Cmp); 5289 5290 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5291 InstWidening WideningDecision = getWideningDecision(I, VF); 5292 assert(WideningDecision != CM_Unknown && 5293 "Widening decision should be ready at this moment"); 5294 5295 // A uniform memory op is itself uniform. We exclude uniform stores 5296 // here as they demand the last lane, not the first one. 5297 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5298 assert(WideningDecision == CM_Scalarize); 5299 return true; 5300 } 5301 5302 return (WideningDecision == CM_Widen || 5303 WideningDecision == CM_Widen_Reverse || 5304 WideningDecision == CM_Interleave); 5305 }; 5306 5307 5308 // Returns true if Ptr is the pointer operand of a memory access instruction 5309 // I, and I is known to not require scalarization. 5310 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5311 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5312 }; 5313 5314 // Holds a list of values which are known to have at least one uniform use. 5315 // Note that there may be other uses which aren't uniform. A "uniform use" 5316 // here is something which only demands lane 0 of the unrolled iterations; 5317 // it does not imply that all lanes produce the same value (e.g. this is not 5318 // the usual meaning of uniform) 5319 SmallPtrSet<Value *, 8> HasUniformUse; 5320 5321 // Scan the loop for instructions which are either a) known to have only 5322 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 
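  // Example of what this scan catches, comment only: a load from a
  // loop-invariant address, e.g. `s = *p` inside the loop, only demands lane
  // 0 and is added directly; its in-loop address computation is recorded in
  // HasUniformUse so that it too can be marked uniform below if all of its
  // users are such accesses.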
5323 for (auto *BB : TheLoop->blocks()) 5324 for (auto &I : *BB) { 5325 // If there's no pointer operand, there's nothing to do. 5326 auto *Ptr = getLoadStorePointerOperand(&I); 5327 if (!Ptr) 5328 continue; 5329 5330 // A uniform memory op is itself uniform. We exclude uniform stores 5331 // here as they demand the last lane, not the first one. 5332 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5333 addToWorklistIfAllowed(&I); 5334 5335 if (isUniformDecision(&I, VF)) { 5336 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5337 HasUniformUse.insert(Ptr); 5338 } 5339 } 5340 5341 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5342 // demanding) users. Since loops are assumed to be in LCSSA form, this 5343 // disallows uses outside the loop as well. 5344 for (auto *V : HasUniformUse) { 5345 if (isOutOfScope(V)) 5346 continue; 5347 auto *I = cast<Instruction>(V); 5348 auto UsersAreMemAccesses = 5349 llvm::all_of(I->users(), [&](User *U) -> bool { 5350 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5351 }); 5352 if (UsersAreMemAccesses) 5353 addToWorklistIfAllowed(I); 5354 } 5355 5356 // Expand Worklist in topological order: whenever a new instruction 5357 // is added , its users should be already inside Worklist. It ensures 5358 // a uniform instruction will only be used by uniform instructions. 5359 unsigned idx = 0; 5360 while (idx != Worklist.size()) { 5361 Instruction *I = Worklist[idx++]; 5362 5363 for (auto OV : I->operand_values()) { 5364 // isOutOfScope operands cannot be uniform instructions. 5365 if (isOutOfScope(OV)) 5366 continue; 5367 // First order recurrence Phi's should typically be considered 5368 // non-uniform. 5369 auto *OP = dyn_cast<PHINode>(OV); 5370 if (OP && Legal->isFirstOrderRecurrence(OP)) 5371 continue; 5372 // If all the users of the operand are uniform, then add the 5373 // operand into the uniform worklist. 5374 auto *OI = cast<Instruction>(OV); 5375 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5376 auto *J = cast<Instruction>(U); 5377 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5378 })) 5379 addToWorklistIfAllowed(OI); 5380 } 5381 } 5382 5383 // For an instruction to be added into Worklist above, all its users inside 5384 // the loop should also be in Worklist. However, this condition cannot be 5385 // true for phi nodes that form a cyclic dependence. We must process phi 5386 // nodes separately. An induction variable will remain uniform if all users 5387 // of the induction variable and induction variable update remain uniform. 5388 // The code below handles both pointer and non-pointer induction variables. 5389 for (auto &Induction : Legal->getInductionVars()) { 5390 auto *Ind = Induction.first; 5391 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5392 5393 // Determine if all users of the induction variable are uniform after 5394 // vectorization. 5395 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5396 auto *I = cast<Instruction>(U); 5397 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5398 isVectorizedMemAccessUse(I, Ind); 5399 }); 5400 if (!UniformInd) 5401 continue; 5402 5403 // Determine if all users of the induction variable update instruction are 5404 // uniform after vectorization. 
5405     auto UniformIndUpdate =
5406         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5407           auto *I = cast<Instruction>(U);
5408           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5409                  isVectorizedMemAccessUse(I, IndUpdate);
5410         });
5411     if (!UniformIndUpdate)
5412       continue;
5413
5414     // The induction variable and its update instruction will remain uniform.
5415     addToWorklistIfAllowed(Ind);
5416     addToWorklistIfAllowed(IndUpdate);
5417   }
5418
5419   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5420 }
5421
5422 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5423   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5424
5425   if (Legal->getRuntimePointerChecking()->Need) {
5426     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5427         "runtime pointer checks needed. Enable vectorization of this "
5428         "loop with '#pragma clang loop vectorize(enable)' when "
5429         "compiling with -Os/-Oz",
5430         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5431     return true;
5432   }
5433
5434   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5435     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5436         "runtime SCEV checks needed. Enable vectorization of this "
5437         "loop with '#pragma clang loop vectorize(enable)' when "
5438         "compiling with -Os/-Oz",
5439         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5440     return true;
5441   }
5442
5443   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5444   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5445     reportVectorizationFailure("Runtime stride check for small trip count",
5446         "runtime stride == 1 checks needed. Enable vectorization of "
5447         "this loop without such check by compiling with -Os/-Oz",
5448         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5449     return true;
5450   }
5451
5452   return false;
5453 }
5454
5455 Optional<ElementCount>
5456 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5457   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5458     // TODO: It may be useful to allow this, since the runtime check is still
5459     // likely to be dynamically uniform if the target can skip it.
5460     reportVectorizationFailure(
5461         "Not inserting runtime ptr check for divergent target",
5462         "runtime pointer checks needed. Not enabled for divergent target",
5463         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5464     return None;
5465   }
5466
5467   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5468   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5469   if (TC == 1) {
5470     reportVectorizationFailure("Single iteration (non) loop",
5471         "loop trip count is one, irrelevant for vectorization",
5472         "SingleIterationLoop", ORE, TheLoop);
5473     return None;
5474   }
5475
5476   ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5477
5478   switch (ScalarEpilogueStatus) {
5479   case CM_ScalarEpilogueAllowed:
5480     return MaxVF;
5481   case CM_ScalarEpilogueNotAllowedUsePredicate:
5482     LLVM_FALLTHROUGH;
5483   case CM_ScalarEpilogueNotNeededUsePredicate:
5484     LLVM_DEBUG(
5485         dbgs() << "LV: vector predicate hint/switch found.\n"
5486                << "LV: Not allowing scalar epilogue, creating predicated "
5487                << "vector loop.\n");
5488     break;
5489   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5490     // fallthrough as a special case of OptForSize
5491   case CM_ScalarEpilogueNotAllowedOptSize:
5492     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5493       LLVM_DEBUG(
5494           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5495     else
5496       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5497                         << "count.\n");
5498
5499     // Bail if runtime checks are required, which are not good when optimizing
5500     // for size.
5501     if (runtimeChecksRequired())
5502       return None;
5503
5504     break;
5505   }
5506
5507   // The only loops we can vectorize without a scalar epilogue are loops with
5508   // a bottom-test and a single exiting block. We'd have to handle the fact
5509   // that not every instruction executes on the last iteration. This will
5510   // require a lane mask which varies through the vector loop body. (TODO)
5511   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5512     // If there was a tail-folding hint/switch, but we can't fold the tail by
5513     // masking, fallback to a vectorization with a scalar epilogue.
5514     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5515       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5516                            "scalar epilogue instead.\n");
5517       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5518       return MaxVF;
5519     }
5520     return None;
5521   }
5522
5523   // Now try the tail folding.
5524
5525   // Invalidate interleave groups that require an epilogue if we can't mask
5526   // the interleave-group.
5527   if (!useMaskedInterleavedAccesses(TTI)) {
5528     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5529            "No decisions should have been taken at this point");
5530     // Note: There is no need to invalidate any cost modeling decisions here, as
5531     // none were taken so far.
5532     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5533   }
5534
5535   assert(!MaxVF.isScalable() &&
5536          "Scalable vectors do not yet support tail folding");
5537   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5538          "MaxVF must be a power of 2");
5539   unsigned MaxVFtimesIC =
5540       UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5541   // Avoid tail folding if the trip count is known to be a multiple of any VF we
5542   // chose.
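  // Worked example of the check below (hypothetical numbers, not values
  // computed by this function): with a known trip count of 1024, MaxVF = 8 and
  // UserIC = 2, MaxVFtimesIC is 16 and
  //
  //   ExitCount urem MaxVFtimesIC  ==  1024 % 16  ==  0,
  //
  // so no scalar tail remains and MaxVF is returned without tail folding.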
5543 ScalarEvolution *SE = PSE.getSE(); 5544 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5545 const SCEV *ExitCount = SE->getAddExpr( 5546 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5547 const SCEV *Rem = SE->getURemExpr( 5548 ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5549 if (Rem->isZero()) { 5550 // Accept MaxVF if we do not have a tail. 5551 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5552 return MaxVF; 5553 } 5554 5555 // If we don't know the precise trip count, or if the trip count that we 5556 // found modulo the vectorization factor is not zero, try to fold the tail 5557 // by masking. 5558 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5559 if (Legal->prepareToFoldTailByMasking()) { 5560 FoldTailByMasking = true; 5561 return MaxVF; 5562 } 5563 5564 // If there was a tail-folding hint/switch, but we can't fold the tail by 5565 // masking, fallback to a vectorization with a scalar epilogue. 5566 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5567 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5568 "scalar epilogue instead.\n"); 5569 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5570 return MaxVF; 5571 } 5572 5573 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5574 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5575 return None; 5576 } 5577 5578 if (TC == 0) { 5579 reportVectorizationFailure( 5580 "Unable to calculate the loop count due to complex control flow", 5581 "unable to calculate the loop count due to complex control flow", 5582 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5583 return None; 5584 } 5585 5586 reportVectorizationFailure( 5587 "Cannot optimize for size and vectorize at the same time.", 5588 "cannot optimize for size and vectorize at the same time. " 5589 "Enable vectorization of this loop with '#pragma clang loop " 5590 "vectorize(enable)' when compiling with -Os/-Oz", 5591 "NoTailLoopWithOptForSize", ORE, TheLoop); 5592 return None; 5593 } 5594 5595 ElementCount 5596 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5597 ElementCount UserVF) { 5598 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5599 unsigned SmallestType, WidestType; 5600 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5601 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5602 5603 // Get the maximum safe dependence distance in bits computed by LAA. 5604 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5605 // the memory accesses that is most restrictive (involved in the smallest 5606 // dependence distance). 5607 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5608 5609 if (UserVF.isNonZero()) { 5610 // For now, don't verify legality of scalable vectors. 5611 // This will be addressed properly in https://reviews.llvm.org/D91718. 5612 if (UserVF.isScalable()) 5613 return UserVF; 5614 5615 // If legally unsafe, clamp the user vectorization factor to a safe value. 
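  // Worked example of the clamping below (hypothetical numbers): with
  // MaxSafeVectorWidthInBits = 256 and WidestType = 32,
  // MaxSafeVF = PowerOf2Floor(256 / 32) = 8, so a user-requested VF of 16
  // would be clamped to 8.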
5616 unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5617 if (UserVF.getFixedValue() <= MaxSafeVF) 5618 return UserVF; 5619 5620 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5621 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5622 << ".\n"); 5623 ORE->emit([&]() { 5624 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5625 TheLoop->getStartLoc(), 5626 TheLoop->getHeader()) 5627 << "User-specified vectorization factor " 5628 << ore::NV("UserVectorizationFactor", UserVF) 5629 << " is unsafe, clamping to maximum safe vectorization factor " 5630 << ore::NV("VectorizationFactor", MaxSafeVF); 5631 }); 5632 return ElementCount::getFixed(MaxSafeVF); 5633 } 5634 5635 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5636 5637 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5638 // Note that both WidestRegister and WidestType may not be a powers of 2. 5639 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5640 5641 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5642 << " / " << WidestType << " bits.\n"); 5643 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5644 << WidestRegister << " bits.\n"); 5645 5646 assert(MaxVectorSize <= WidestRegister && 5647 "Did not expect to pack so many elements" 5648 " into one vector!"); 5649 if (MaxVectorSize == 0) { 5650 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5651 MaxVectorSize = 1; 5652 return ElementCount::getFixed(MaxVectorSize); 5653 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5654 isPowerOf2_32(ConstTripCount)) { 5655 // We need to clamp the VF to be the ConstTripCount. There is no point in 5656 // choosing a higher viable VF as done in the loop below. 5657 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5658 << ConstTripCount << "\n"); 5659 MaxVectorSize = ConstTripCount; 5660 return ElementCount::getFixed(MaxVectorSize); 5661 } 5662 5663 unsigned MaxVF = MaxVectorSize; 5664 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5665 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5666 // Collect all viable vectorization factors larger than the default MaxVF 5667 // (i.e. MaxVectorSize). 5668 SmallVector<ElementCount, 8> VFs; 5669 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5670 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5671 VFs.push_back(ElementCount::getFixed(VS)); 5672 5673 // For each VF calculate its register usage. 5674 auto RUs = calculateRegisterUsage(VFs); 5675 5676 // Select the largest VF which doesn't require more registers than existing 5677 // ones. 
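  // A minimal standalone sketch of the selection below (illustrative only; the
  // helper, its parameters and the register counts are made up and ignore
  // register classes, rather than coming from TTI or this pass):
  //
  //   // Return the largest candidate VF whose register usage fits the budget.
  //   unsigned pickLargestFittingVF(ArrayRef<unsigned> CandidateVFs,
  //                                 ArrayRef<unsigned> RegsNeeded,
  //                                 unsigned RegsAvailable) {
  //     for (int i = CandidateVFs.size() - 1; i >= 0; --i)
  //       if (RegsNeeded[i] <= RegsAvailable)
  //         return CandidateVFs[i];
  //     return 1; // nothing fits; fall back to the default MaxVF handling
  //   }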
5678 for (int i = RUs.size() - 1; i >= 0; --i) { 5679 bool Selected = true; 5680 for (auto& pair : RUs[i].MaxLocalUsers) { 5681 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5682 if (pair.second > TargetNumRegisters) 5683 Selected = false; 5684 } 5685 if (Selected) { 5686 MaxVF = VFs[i].getKnownMinValue(); 5687 break; 5688 } 5689 } 5690 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5691 if (MaxVF < MinVF) { 5692 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5693 << ") with target's minimum: " << MinVF << '\n'); 5694 MaxVF = MinVF; 5695 } 5696 } 5697 } 5698 return ElementCount::getFixed(MaxVF); 5699 } 5700 5701 VectorizationFactor 5702 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5703 // FIXME: This can be fixed for scalable vectors later, because at this stage 5704 // the LoopVectorizer will only consider vectorizing a loop with scalable 5705 // vectors when the loop has a hint to enable vectorization for a given VF. 5706 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5707 5708 float Cost = expectedCost(ElementCount::getFixed(1)).first; 5709 const float ScalarCost = Cost; 5710 unsigned Width = 1; 5711 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5712 5713 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5714 if (ForceVectorization && MaxVF.isVector()) { 5715 // Ignore scalar width, because the user explicitly wants vectorization. 5716 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5717 // evaluation. 5718 Cost = std::numeric_limits<float>::max(); 5719 } 5720 5721 for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { 5722 // Notice that the vector loop needs to be executed less times, so 5723 // we need to divide the cost of the vector loops by the width of 5724 // the vector elements. 5725 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5726 float VectorCost = C.first / (float)i; 5727 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5728 << " costs: " << (int)VectorCost << ".\n"); 5729 if (!C.second && !ForceVectorization) { 5730 LLVM_DEBUG( 5731 dbgs() << "LV: Not considering vector loop of width " << i 5732 << " because it will not generate any vector instructions.\n"); 5733 continue; 5734 } 5735 5736 // If profitable add it to ProfitableVF list. 5737 if (VectorCost < ScalarCost) { 5738 ProfitableVFs.push_back(VectorizationFactor( 5739 {ElementCount::getFixed(i), (unsigned)VectorCost})); 5740 } 5741 5742 if (VectorCost < Cost) { 5743 Cost = VectorCost; 5744 Width = i; 5745 } 5746 } 5747 5748 if (!EnableCondStoresVectorization && NumPredStores) { 5749 reportVectorizationFailure("There are conditional stores.", 5750 "store that is conditionally executed prevents vectorization", 5751 "ConditionalStore", ORE, TheLoop); 5752 Width = 1; 5753 Cost = ScalarCost; 5754 } 5755 5756 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5757 << "LV: Vectorization seems to be not beneficial, " 5758 << "but was forced by a user.\n"); 5759 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5760 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5761 (unsigned)(Width * Cost)}; 5762 return Factor; 5763 } 5764 5765 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5766 const Loop &L, ElementCount VF) const { 5767 // Cross iteration phis such as reductions need special handling and are 5768 // currently unsupported. 
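  // Illustrative example of a rejected case (hypothetical input loop, not
  // anything analyzed here):
  //
  //   int s = 0;
  //   for (int i = 0; i < n; ++i)
  //     s += a[i];   // 's' becomes a reduction phi in the loop header
  //
  // The reduction phi is a cross-iteration value, so the check below bails out.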
5769 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5770 return Legal->isFirstOrderRecurrence(&Phi) || 5771 Legal->isReductionVariable(&Phi); 5772 })) 5773 return false; 5774 5775 // Phis with uses outside of the loop require special handling and are 5776 // currently unsupported. 5777 for (auto &Entry : Legal->getInductionVars()) { 5778 // Look for uses of the value of the induction at the last iteration. 5779 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5780 for (User *U : PostInc->users()) 5781 if (!L.contains(cast<Instruction>(U))) 5782 return false; 5783 // Look for uses of penultimate value of the induction. 5784 for (User *U : Entry.first->users()) 5785 if (!L.contains(cast<Instruction>(U))) 5786 return false; 5787 } 5788 5789 // Induction variables that are widened require special handling that is 5790 // currently not supported. 5791 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5792 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5793 this->isProfitableToScalarize(Entry.first, VF)); 5794 })) 5795 return false; 5796 5797 return true; 5798 } 5799 5800 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5801 const ElementCount VF) const { 5802 // FIXME: We need a much better cost-model to take different parameters such 5803 // as register pressure, code size increase and cost of extra branches into 5804 // account. For now we apply a very crude heuristic and only consider loops 5805 // with vectorization factors larger than a certain value. 5806 // We also consider epilogue vectorization unprofitable for targets that don't 5807 // consider interleaving beneficial (eg. MVE). 5808 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5809 return false; 5810 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5811 return true; 5812 return false; 5813 } 5814 5815 VectorizationFactor 5816 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5817 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5818 VectorizationFactor Result = VectorizationFactor::Disabled(); 5819 if (!EnableEpilogueVectorization) { 5820 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5821 return Result; 5822 } 5823 5824 if (!isScalarEpilogueAllowed()) { 5825 LLVM_DEBUG( 5826 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5827 "allowed.\n";); 5828 return Result; 5829 } 5830 5831 // FIXME: This can be fixed for scalable vectors later, because at this stage 5832 // the LoopVectorizer will only consider vectorizing a loop with scalable 5833 // vectors when the loop has a hint to enable vectorization for a given VF. 5834 if (MainLoopVF.isScalable()) { 5835 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5836 "yet supported.\n"); 5837 return Result; 5838 } 5839 5840 // Not really a cost consideration, but check for unsupported cases here to 5841 // simplify the logic. 
5842 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5843 LLVM_DEBUG( 5844 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5845 "not a supported candidate.\n";); 5846 return Result; 5847 } 5848 5849 if (EpilogueVectorizationForceVF > 1) { 5850 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5851 if (LVP.hasPlanWithVFs( 5852 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5853 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5854 else { 5855 LLVM_DEBUG( 5856 dbgs() 5857 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5858 return Result; 5859 } 5860 } 5861 5862 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5863 TheLoop->getHeader()->getParent()->hasMinSize()) { 5864 LLVM_DEBUG( 5865 dbgs() 5866 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5867 return Result; 5868 } 5869 5870 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 5871 return Result; 5872 5873 for (auto &NextVF : ProfitableVFs) 5874 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 5875 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 5876 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 5877 Result = NextVF; 5878 5879 if (Result != VectorizationFactor::Disabled()) 5880 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5881 << Result.Width.getFixedValue() << "\n";); 5882 return Result; 5883 } 5884 5885 std::pair<unsigned, unsigned> 5886 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5887 unsigned MinWidth = -1U; 5888 unsigned MaxWidth = 8; 5889 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5890 5891 // For each block. 5892 for (BasicBlock *BB : TheLoop->blocks()) { 5893 // For each instruction in the loop. 5894 for (Instruction &I : BB->instructionsWithoutDebug()) { 5895 Type *T = I.getType(); 5896 5897 // Skip ignored values. 5898 if (ValuesToIgnore.count(&I)) 5899 continue; 5900 5901 // Only examine Loads, Stores and PHINodes. 5902 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5903 continue; 5904 5905 // Examine PHI nodes that are reduction variables. Update the type to 5906 // account for the recurrence type. 5907 if (auto *PN = dyn_cast<PHINode>(&I)) { 5908 if (!Legal->isReductionVariable(PN)) 5909 continue; 5910 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5911 T = RdxDesc.getRecurrenceType(); 5912 } 5913 5914 // Examine the stored values. 5915 if (auto *ST = dyn_cast<StoreInst>(&I)) 5916 T = ST->getValueOperand()->getType(); 5917 5918 // Ignore loaded pointer types and stored pointer types that are not 5919 // vectorizable. 5920 // 5921 // FIXME: The check here attempts to predict whether a load or store will 5922 // be vectorized. We only know this for certain after a VF has 5923 // been selected. Here, we assume that if an access can be 5924 // vectorized, it will be. We should also look at extending this 5925 // optimization to non-pointer types. 
5926 // 5927 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5928 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5929 continue; 5930 5931 MinWidth = std::min(MinWidth, 5932 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5933 MaxWidth = std::max(MaxWidth, 5934 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5935 } 5936 } 5937 5938 return {MinWidth, MaxWidth}; 5939 } 5940 5941 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5942 unsigned LoopCost) { 5943 // -- The interleave heuristics -- 5944 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5945 // There are many micro-architectural considerations that we can't predict 5946 // at this level. For example, frontend pressure (on decode or fetch) due to 5947 // code size, or the number and capabilities of the execution ports. 5948 // 5949 // We use the following heuristics to select the interleave count: 5950 // 1. If the code has reductions, then we interleave to break the cross 5951 // iteration dependency. 5952 // 2. If the loop is really small, then we interleave to reduce the loop 5953 // overhead. 5954 // 3. We don't interleave if we think that we will spill registers to memory 5955 // due to the increased register pressure. 5956 5957 if (!isScalarEpilogueAllowed()) 5958 return 1; 5959 5960 // We used the distance for the interleave count. 5961 if (Legal->getMaxSafeDepDistBytes() != -1U) 5962 return 1; 5963 5964 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5965 const bool HasReductions = !Legal->getReductionVars().empty(); 5966 // Do not interleave loops with a relatively small known or estimated trip 5967 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5968 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5969 // because with the above conditions interleaving can expose ILP and break 5970 // cross iteration dependences for reductions. 5971 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5972 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5973 return 1; 5974 5975 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5976 // We divide by these constants so assume that we have at least one 5977 // instruction that uses at least one register. 5978 for (auto& pair : R.MaxLocalUsers) { 5979 pair.second = std::max(pair.second, 1U); 5980 } 5981 5982 // We calculate the interleave count using the following formula. 5983 // Subtract the number of loop invariants from the number of available 5984 // registers. These registers are used by all of the interleaved instances. 5985 // Next, divide the remaining registers by the number of registers that is 5986 // required by the loop, in order to estimate how many parallel instances 5987 // fit without causing spills. All of this is rounded down if necessary to be 5988 // a power of two. We want power of two interleave count to simplify any 5989 // addressing operations or alignment considerations. 5990 // We also want power of two interleave counts to ensure that the induction 5991 // variable of the vector loop wraps to zero, when tail is folded by masking; 5992 // this currently happens when OptForSize, in which case IC is set to 1 above. 
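  // Worked example of the formula below (hypothetical numbers, not real TTI
  // values): with 32 available registers, 2 loop-invariant values and 5
  // simultaneously live values,
  //
  //   IC = PowerOf2Floor((32 - 2) / 5) = PowerOf2Floor(6) = 4,
  //
  // and with EnableIndVarRegisterHeur the induction variable is not counted:
  //
  //   IC = PowerOf2Floor((32 - 2 - 1) / (5 - 1)) = PowerOf2Floor(7) = 4.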
5993 unsigned IC = UINT_MAX; 5994 5995 for (auto& pair : R.MaxLocalUsers) { 5996 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5997 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5998 << " registers of " 5999 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6000 if (VF.isScalar()) { 6001 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6002 TargetNumRegisters = ForceTargetNumScalarRegs; 6003 } else { 6004 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6005 TargetNumRegisters = ForceTargetNumVectorRegs; 6006 } 6007 unsigned MaxLocalUsers = pair.second; 6008 unsigned LoopInvariantRegs = 0; 6009 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6010 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6011 6012 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6013 // Don't count the induction variable as interleaved. 6014 if (EnableIndVarRegisterHeur) { 6015 TmpIC = 6016 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6017 std::max(1U, (MaxLocalUsers - 1))); 6018 } 6019 6020 IC = std::min(IC, TmpIC); 6021 } 6022 6023 // Clamp the interleave ranges to reasonable counts. 6024 unsigned MaxInterleaveCount = 6025 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6026 6027 // Check if the user has overridden the max. 6028 if (VF.isScalar()) { 6029 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6030 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6031 } else { 6032 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6033 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6034 } 6035 6036 // If trip count is known or estimated compile time constant, limit the 6037 // interleave count to be less than the trip count divided by VF, provided it 6038 // is at least 1. 6039 // 6040 // For scalable vectors we can't know if interleaving is beneficial. It may 6041 // not be beneficial for small loops if none of the lanes in the second vector 6042 // iterations is enabled. However, for larger loops, there is likely to be a 6043 // similar benefit as for fixed-width vectors. For now, we choose to leave 6044 // the InterleaveCount as if vscale is '1', although if some information about 6045 // the vector is known (e.g. min vector size), we can make a better decision. 6046 if (BestKnownTC) { 6047 MaxInterleaveCount = 6048 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6049 // Make sure MaxInterleaveCount is greater than 0. 6050 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6051 } 6052 6053 assert(MaxInterleaveCount > 0 && 6054 "Maximum interleave count must be greater than 0"); 6055 6056 // Clamp the calculated IC to be between the 1 and the max interleave count 6057 // that the target and trip count allows. 6058 if (IC > MaxInterleaveCount) 6059 IC = MaxInterleaveCount; 6060 else 6061 // Make sure IC is greater than 0. 6062 IC = std::max(1u, IC); 6063 6064 assert(IC > 0 && "Interleave count must be greater than 0."); 6065 6066 // If we did not calculate the cost for VF (because the user selected the VF) 6067 // then we calculate the cost of VF here. 6068 if (LoopCost == 0) 6069 LoopCost = expectedCost(VF).first; 6070 6071 assert(LoopCost && "Non-zero loop cost expected"); 6072 6073 // Interleave if we vectorized this loop and there is a reduction that could 6074 // benefit from interleaving. 
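  // Worked example of the clamping above (hypothetical numbers): with a target
  // maximum interleave factor of 8, an estimated trip count of 64 and VF = 4,
  // MaxInterleaveCount = min(64 / 4, 8) = 8, so an IC of 16 computed from the
  // register heuristic would be clamped down to 8.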
6075 if (VF.isVector() && HasReductions) { 6076 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6077 return IC; 6078 } 6079 6080 // Note that if we've already vectorized the loop we will have done the 6081 // runtime check and so interleaving won't require further checks. 6082 bool InterleavingRequiresRuntimePointerCheck = 6083 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6084 6085 // We want to interleave small loops in order to reduce the loop overhead and 6086 // potentially expose ILP opportunities. 6087 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6088 << "LV: IC is " << IC << '\n' 6089 << "LV: VF is " << VF << '\n'); 6090 const bool AggressivelyInterleaveReductions = 6091 TTI.enableAggressiveInterleaving(HasReductions); 6092 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6093 // We assume that the cost overhead is 1 and we use the cost model 6094 // to estimate the cost of the loop and interleave until the cost of the 6095 // loop overhead is about 5% of the cost of the loop. 6096 unsigned SmallIC = 6097 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6098 6099 // Interleave until store/load ports (estimated by max interleave count) are 6100 // saturated. 6101 unsigned NumStores = Legal->getNumStores(); 6102 unsigned NumLoads = Legal->getNumLoads(); 6103 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6104 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6105 6106 // If we have a scalar reduction (vector reductions are already dealt with 6107 // by this point), we can increase the critical path length if the loop 6108 // we're interleaving is inside another loop. Limit, by default to 2, so the 6109 // critical path only gets increased by one reduction operation. 6110 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6111 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6112 SmallIC = std::min(SmallIC, F); 6113 StoresIC = std::min(StoresIC, F); 6114 LoadsIC = std::min(LoadsIC, F); 6115 } 6116 6117 if (EnableLoadStoreRuntimeInterleave && 6118 std::max(StoresIC, LoadsIC) > SmallIC) { 6119 LLVM_DEBUG( 6120 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6121 return std::max(StoresIC, LoadsIC); 6122 } 6123 6124 // If there are scalar reductions and TTI has enabled aggressive 6125 // interleaving for reductions, we will interleave to expose ILP. 6126 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6127 AggressivelyInterleaveReductions) { 6128 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6129 // Interleave no less than SmallIC but not as aggressive as the normal IC 6130 // to satisfy the rare situation when resources are too limited. 6131 return std::max(IC / 2, SmallIC); 6132 } else { 6133 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6134 return SmallIC; 6135 } 6136 } 6137 6138 // Interleave if this is a large loop (small loops are already dealt with by 6139 // this point) that could benefit from interleaving. 6140 if (AggressivelyInterleaveReductions) { 6141 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6142 return IC; 6143 } 6144 6145 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6146 return 1; 6147 } 6148 6149 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6150 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6151 // This function calculates the register usage by measuring the highest number 6152 // of values that are alive at a single location. 
Obviously, this is a very 6153 // rough estimation. We scan the loop in a topological order in order and 6154 // assign a number to each instruction. We use RPO to ensure that defs are 6155 // met before their users. We assume that each instruction that has in-loop 6156 // users starts an interval. We record every time that an in-loop value is 6157 // used, so we have a list of the first and last occurrences of each 6158 // instruction. Next, we transpose this data structure into a multi map that 6159 // holds the list of intervals that *end* at a specific location. This multi 6160 // map allows us to perform a linear search. We scan the instructions linearly 6161 // and record each time that a new interval starts, by placing it in a set. 6162 // If we find this value in the multi-map then we remove it from the set. 6163 // The max register usage is the maximum size of the set. 6164 // We also search for instructions that are defined outside the loop, but are 6165 // used inside the loop. We need this number separately from the max-interval 6166 // usage number because when we unroll, loop-invariant values do not take 6167 // more register. 6168 LoopBlocksDFS DFS(TheLoop); 6169 DFS.perform(LI); 6170 6171 RegisterUsage RU; 6172 6173 // Each 'key' in the map opens a new interval. The values 6174 // of the map are the index of the 'last seen' usage of the 6175 // instruction that is the key. 6176 using IntervalMap = DenseMap<Instruction *, unsigned>; 6177 6178 // Maps instruction to its index. 6179 SmallVector<Instruction *, 64> IdxToInstr; 6180 // Marks the end of each interval. 6181 IntervalMap EndPoint; 6182 // Saves the list of instruction indices that are used in the loop. 6183 SmallPtrSet<Instruction *, 8> Ends; 6184 // Saves the list of values that are used in the loop but are 6185 // defined outside the loop, such as arguments and constants. 6186 SmallPtrSet<Value *, 8> LoopInvariants; 6187 6188 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6189 for (Instruction &I : BB->instructionsWithoutDebug()) { 6190 IdxToInstr.push_back(&I); 6191 6192 // Save the end location of each USE. 6193 for (Value *U : I.operands()) { 6194 auto *Instr = dyn_cast<Instruction>(U); 6195 6196 // Ignore non-instruction values such as arguments, constants, etc. 6197 if (!Instr) 6198 continue; 6199 6200 // If this instruction is outside the loop then record it and continue. 6201 if (!TheLoop->contains(Instr)) { 6202 LoopInvariants.insert(Instr); 6203 continue; 6204 } 6205 6206 // Overwrite previous end points. 6207 EndPoint[Instr] = IdxToInstr.size(); 6208 Ends.insert(Instr); 6209 } 6210 } 6211 } 6212 6213 // Saves the list of intervals that end with the index in 'key'. 6214 using InstrList = SmallVector<Instruction *, 2>; 6215 DenseMap<unsigned, InstrList> TransposeEnds; 6216 6217 // Transpose the EndPoints to a list of values that end at each index. 6218 for (auto &Interval : EndPoint) 6219 TransposeEnds[Interval.second].push_back(Interval.first); 6220 6221 SmallPtrSet<Instruction *, 8> OpenIntervals; 6222 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6223 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6224 6225 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6226 6227 // A lambda that gets the register usage for the given type and VF. 
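  // A minimal standalone sketch of the interval counting described at the top
  // of this function (illustrative only: the helper and its parameters are
  // made up, register classes and VF are ignored, and it needs <algorithm> and
  // ArrayRef if taken out of this comment):
  //
  //   // Interval i is live over the half-open index range [Start[i], End[i]).
  //   unsigned maxLiveIntervals(ArrayRef<unsigned> Start, ArrayRef<unsigned> End,
  //                             unsigned NumPoints) {
  //     unsigned MaxLive = 0;
  //     for (unsigned P = 0; P < NumPoints; ++P) {
  //       unsigned Live = 0;
  //       for (unsigned i = 0, e = Start.size(); i != e; ++i)
  //         Live += (Start[i] <= P && P < End[i]);
  //       MaxLive = std::max(MaxLive, Live);
  //     }
  //     return MaxLive;
  //   }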
6228 const auto &TTICapture = TTI; 6229 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6230 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6231 return 0U; 6232 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6233 }; 6234 6235 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6236 Instruction *I = IdxToInstr[i]; 6237 6238 // Remove all of the instructions that end at this location. 6239 InstrList &List = TransposeEnds[i]; 6240 for (Instruction *ToRemove : List) 6241 OpenIntervals.erase(ToRemove); 6242 6243 // Ignore instructions that are never used within the loop. 6244 if (!Ends.count(I)) 6245 continue; 6246 6247 // Skip ignored values. 6248 if (ValuesToIgnore.count(I)) 6249 continue; 6250 6251 // For each VF find the maximum usage of registers. 6252 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6253 // Count the number of live intervals. 6254 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6255 6256 if (VFs[j].isScalar()) { 6257 for (auto Inst : OpenIntervals) { 6258 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6259 if (RegUsage.find(ClassID) == RegUsage.end()) 6260 RegUsage[ClassID] = 1; 6261 else 6262 RegUsage[ClassID] += 1; 6263 } 6264 } else { 6265 collectUniformsAndScalars(VFs[j]); 6266 for (auto Inst : OpenIntervals) { 6267 // Skip ignored values for VF > 1. 6268 if (VecValuesToIgnore.count(Inst)) 6269 continue; 6270 if (isScalarAfterVectorization(Inst, VFs[j])) { 6271 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6272 if (RegUsage.find(ClassID) == RegUsage.end()) 6273 RegUsage[ClassID] = 1; 6274 else 6275 RegUsage[ClassID] += 1; 6276 } else { 6277 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6278 if (RegUsage.find(ClassID) == RegUsage.end()) 6279 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6280 else 6281 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6282 } 6283 } 6284 } 6285 6286 for (auto& pair : RegUsage) { 6287 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6288 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6289 else 6290 MaxUsages[j][pair.first] = pair.second; 6291 } 6292 } 6293 6294 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6295 << OpenIntervals.size() << '\n'); 6296 6297 // Add the current instruction to the list of open intervals. 6298 OpenIntervals.insert(I); 6299 } 6300 6301 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6302 SmallMapVector<unsigned, unsigned, 4> Invariant; 6303 6304 for (auto Inst : LoopInvariants) { 6305 unsigned Usage = 6306 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6307 unsigned ClassID = 6308 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6309 if (Invariant.find(ClassID) == Invariant.end()) 6310 Invariant[ClassID] = Usage; 6311 else 6312 Invariant[ClassID] += Usage; 6313 } 6314 6315 LLVM_DEBUG({ 6316 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6317 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6318 << " item\n"; 6319 for (const auto &pair : MaxUsages[i]) { 6320 dbgs() << "LV(REG): RegisterClass: " 6321 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6322 << " registers\n"; 6323 } 6324 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6325 << " item\n"; 6326 for (const auto &pair : Invariant) { 6327 dbgs() << "LV(REG): RegisterClass: " 6328 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6329 << " registers\n"; 6330 } 6331 }); 6332 6333 RU.LoopInvariantRegs = Invariant; 6334 RU.MaxLocalUsers = MaxUsages[i]; 6335 RUs[i] = RU; 6336 } 6337 6338 return RUs; 6339 } 6340 6341 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6342 // TODO: Cost model for emulated masked load/store is completely 6343 // broken. This hack guides the cost model to use an artificially 6344 // high enough value to practically disable vectorization with such 6345 // operations, except where previously deployed legality hack allowed 6346 // using very low cost values. This is to avoid regressions coming simply 6347 // from moving "masked load/store" check from legality to cost model. 6348 // Masked Load/Gather emulation was previously never allowed. 6349 // Limited number of Masked Store/Scatter emulation was allowed. 6350 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6351 return isa<LoadInst>(I) || 6352 (isa<StoreInst>(I) && 6353 NumPredStores > NumberOfStoresToPredicate); 6354 } 6355 6356 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6357 // If we aren't vectorizing the loop, or if we've already collected the 6358 // instructions to scalarize, there's nothing to do. Collection may already 6359 // have occurred if we have a user-selected VF and are now computing the 6360 // expected cost for interleaving. 6361 if (VF.isScalar() || VF.isZero() || 6362 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6363 return; 6364 6365 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6366 // not profitable to scalarize any instructions, the presence of VF in the 6367 // map will indicate that we've analyzed it already. 6368 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6369 6370 // Find all the instructions that are scalar with predication in the loop and 6371 // determine if it would be better to not if-convert the blocks they are in. 6372 // If so, we also record the instructions to scalarize. 6373 for (BasicBlock *BB : TheLoop->blocks()) { 6374 if (!blockNeedsPredication(BB)) 6375 continue; 6376 for (Instruction &I : *BB) 6377 if (isScalarWithPredication(&I)) { 6378 ScalarCostsTy ScalarCosts; 6379 // Do not apply discount logic if hacked cost is needed 6380 // for emulated masked memrefs. 6381 if (!useEmulatedMaskMemRefHack(&I) && 6382 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6383 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6384 // Remember that BB will remain after vectorization. 
6385 PredicatedBBsAfterVectorization.insert(BB); 6386 } 6387 } 6388 } 6389 6390 int LoopVectorizationCostModel::computePredInstDiscount( 6391 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 6392 ElementCount VF) { 6393 assert(!isUniformAfterVectorization(PredInst, VF) && 6394 "Instruction marked uniform-after-vectorization will be predicated"); 6395 6396 // Initialize the discount to zero, meaning that the scalar version and the 6397 // vector version cost the same. 6398 int Discount = 0; 6399 6400 // Holds instructions to analyze. The instructions we visit are mapped in 6401 // ScalarCosts. Those instructions are the ones that would be scalarized if 6402 // we find that the scalar version costs less. 6403 SmallVector<Instruction *, 8> Worklist; 6404 6405 // Returns true if the given instruction can be scalarized. 6406 auto canBeScalarized = [&](Instruction *I) -> bool { 6407 // We only attempt to scalarize instructions forming a single-use chain 6408 // from the original predicated block that would otherwise be vectorized. 6409 // Although not strictly necessary, we give up on instructions we know will 6410 // already be scalar to avoid traversing chains that are unlikely to be 6411 // beneficial. 6412 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6413 isScalarAfterVectorization(I, VF)) 6414 return false; 6415 6416 // If the instruction is scalar with predication, it will be analyzed 6417 // separately. We ignore it within the context of PredInst. 6418 if (isScalarWithPredication(I)) 6419 return false; 6420 6421 // If any of the instruction's operands are uniform after vectorization, 6422 // the instruction cannot be scalarized. This prevents, for example, a 6423 // masked load from being scalarized. 6424 // 6425 // We assume we will only emit a value for lane zero of an instruction 6426 // marked uniform after vectorization, rather than VF identical values. 6427 // Thus, if we scalarize an instruction that uses a uniform, we would 6428 // create uses of values corresponding to the lanes we aren't emitting code 6429 // for. This behavior can be changed by allowing getScalarValue to clone 6430 // the lane zero values for uniforms rather than asserting. 6431 for (Use &U : I->operands()) 6432 if (auto *J = dyn_cast<Instruction>(U.get())) 6433 if (isUniformAfterVectorization(J, VF)) 6434 return false; 6435 6436 // Otherwise, we can scalarize the instruction. 6437 return true; 6438 }; 6439 6440 // Compute the expected cost discount from scalarizing the entire expression 6441 // feeding the predicated instruction. We currently only consider expressions 6442 // that are single-use instruction chains. 6443 Worklist.push_back(PredInst); 6444 while (!Worklist.empty()) { 6445 Instruction *I = Worklist.pop_back_val(); 6446 6447 // If we've already analyzed the instruction, there's nothing to do. 6448 if (ScalarCosts.find(I) != ScalarCosts.end()) 6449 continue; 6450 6451 // Compute the cost of the vector instruction. Note that this cost already 6452 // includes the scalarization overhead of the predicated instruction. 6453 unsigned VectorCost = getInstructionCost(I, VF).first; 6454 6455 // Compute the cost of the scalarized instruction. This cost is the cost of 6456 // the instruction as if it wasn't if-converted and instead remained in the 6457 // predicated block. We will scale this cost by block probability after 6458 // computing the scalarization overhead. 
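  // Worked example of the discount bookkeeping below (hypothetical costs): for
  // VF = 4, a per-lane scalar cost of 2 gives ScalarCost = 8; adding a
  // scalarization overhead of 4 gives 12, and scaling by an assumed block
  // probability of 1/2 (getReciprocalPredBlockProb() == 2) gives 6. With a
  // VectorCost of 10 the instruction contributes 10 - 6 = 4 to Discount, so
  // scalarizing it looks profitable.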
6459 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6460 unsigned ScalarCost = 6461 VF.getKnownMinValue() * 6462 getInstructionCost(I, ElementCount::getFixed(1)).first; 6463 6464 // Compute the scalarization overhead of needed insertelement instructions 6465 // and phi nodes. 6466 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6467 ScalarCost += TTI.getScalarizationOverhead( 6468 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6469 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6470 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6471 ScalarCost += 6472 VF.getKnownMinValue() * 6473 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6474 } 6475 6476 // Compute the scalarization overhead of needed extractelement 6477 // instructions. For each of the instruction's operands, if the operand can 6478 // be scalarized, add it to the worklist; otherwise, account for the 6479 // overhead. 6480 for (Use &U : I->operands()) 6481 if (auto *J = dyn_cast<Instruction>(U.get())) { 6482 assert(VectorType::isValidElementType(J->getType()) && 6483 "Instruction has non-scalar type"); 6484 if (canBeScalarized(J)) 6485 Worklist.push_back(J); 6486 else if (needsExtract(J, VF)) { 6487 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6488 ScalarCost += TTI.getScalarizationOverhead( 6489 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6490 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6491 } 6492 } 6493 6494 // Scale the total scalar cost by block probability. 6495 ScalarCost /= getReciprocalPredBlockProb(); 6496 6497 // Compute the discount. A non-negative discount means the vector version 6498 // of the instruction costs more, and scalarizing would be beneficial. 6499 Discount += VectorCost - ScalarCost; 6500 ScalarCosts[I] = ScalarCost; 6501 } 6502 6503 return Discount; 6504 } 6505 6506 LoopVectorizationCostModel::VectorizationCostTy 6507 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6508 VectorizationCostTy Cost; 6509 6510 // For each block. 6511 for (BasicBlock *BB : TheLoop->blocks()) { 6512 VectorizationCostTy BlockCost; 6513 6514 // For each instruction in the old loop. 6515 for (Instruction &I : BB->instructionsWithoutDebug()) { 6516 // Skip ignored values. 6517 if (ValuesToIgnore.count(&I) || 6518 (VF.isVector() && VecValuesToIgnore.count(&I))) 6519 continue; 6520 6521 VectorizationCostTy C = getInstructionCost(&I, VF); 6522 6523 // Check if we should override the cost. 6524 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6525 C.first = ForceTargetInstructionCost; 6526 6527 BlockCost.first += C.first; 6528 BlockCost.second |= C.second; 6529 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6530 << " for VF " << VF << " For instruction: " << I 6531 << '\n'); 6532 } 6533 6534 // If we are vectorizing a predicated block, it will have been 6535 // if-converted. This means that the block's instructions (aside from 6536 // stores and instructions that may divide by zero) will now be 6537 // unconditionally executed. For the scalar case, we may not always execute 6538 // the predicated block, if it is an if-else block. Thus, scale the block's 6539 // cost by the probability of executing it. blockNeedsPredication from 6540 // Legal is used so as to not include all blocks in tail folded loops. 
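  // For example (hypothetical cost, assuming a reciprocal block probability of
  // 2): a predicated block with BlockCost.first == 12 contributes only
  // 12 / getReciprocalPredBlockProb() == 6 to the scalar loop cost.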
6541 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6542 BlockCost.first /= getReciprocalPredBlockProb(); 6543 6544 Cost.first += BlockCost.first; 6545 Cost.second |= BlockCost.second; 6546 } 6547 6548 return Cost; 6549 } 6550 6551 /// Gets Address Access SCEV after verifying that the access pattern 6552 /// is loop invariant except the induction variable dependence. 6553 /// 6554 /// This SCEV can be sent to the Target in order to estimate the address 6555 /// calculation cost. 6556 static const SCEV *getAddressAccessSCEV( 6557 Value *Ptr, 6558 LoopVectorizationLegality *Legal, 6559 PredicatedScalarEvolution &PSE, 6560 const Loop *TheLoop) { 6561 6562 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6563 if (!Gep) 6564 return nullptr; 6565 6566 // We are looking for a gep with all loop invariant indices except for one 6567 // which should be an induction variable. 6568 auto SE = PSE.getSE(); 6569 unsigned NumOperands = Gep->getNumOperands(); 6570 for (unsigned i = 1; i < NumOperands; ++i) { 6571 Value *Opd = Gep->getOperand(i); 6572 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6573 !Legal->isInductionVariable(Opd)) 6574 return nullptr; 6575 } 6576 6577 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6578 return PSE.getSCEV(Ptr); 6579 } 6580 6581 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6582 return Legal->hasStride(I->getOperand(0)) || 6583 Legal->hasStride(I->getOperand(1)); 6584 } 6585 6586 unsigned 6587 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6588 ElementCount VF) { 6589 assert(VF.isVector() && 6590 "Scalarization cost of instruction implies vectorization."); 6591 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6592 Type *ValTy = getMemInstValueType(I); 6593 auto SE = PSE.getSE(); 6594 6595 unsigned AS = getLoadStoreAddressSpace(I); 6596 Value *Ptr = getLoadStorePointerOperand(I); 6597 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6598 6599 // Figure out whether the access is strided and get the stride value 6600 // if it's known in compile time 6601 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6602 6603 // Get the cost of the scalar memory instruction and address computation. 6604 unsigned Cost = 6605 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6606 6607 // Don't pass *I here, since it is scalar but will actually be part of a 6608 // vectorized loop where the user of it is a vectorized instruction. 6609 const Align Alignment = getLoadStoreAlignment(I); 6610 Cost += VF.getKnownMinValue() * 6611 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6612 AS, TTI::TCK_RecipThroughput); 6613 6614 // Get the overhead of the extractelement and insertelement instructions 6615 // we might create due to scalarization. 6616 Cost += getScalarizationOverhead(I, VF); 6617 6618 // If we have a predicated store, it may not be executed for each vector 6619 // lane. Scale the cost by the probability of executing the predicated 6620 // block. 6621 if (isPredicatedInst(I)) { 6622 Cost /= getReciprocalPredBlockProb(); 6623 6624 if (useEmulatedMaskMemRefHack(I)) 6625 // Artificially setting to a high enough value to practically disable 6626 // vectorization with such operations. 
6627 Cost = 3000000; 6628 } 6629 6630 return Cost; 6631 } 6632 6633 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6634 ElementCount VF) { 6635 Type *ValTy = getMemInstValueType(I); 6636 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6637 Value *Ptr = getLoadStorePointerOperand(I); 6638 unsigned AS = getLoadStoreAddressSpace(I); 6639 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6640 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6641 6642 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6643 "Stride should be 1 or -1 for consecutive memory access"); 6644 const Align Alignment = getLoadStoreAlignment(I); 6645 unsigned Cost = 0; 6646 if (Legal->isMaskRequired(I)) 6647 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6648 CostKind); 6649 else 6650 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6651 CostKind, I); 6652 6653 bool Reverse = ConsecutiveStride < 0; 6654 if (Reverse) 6655 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6656 return Cost; 6657 } 6658 6659 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6660 ElementCount VF) { 6661 assert(Legal->isUniformMemOp(*I)); 6662 6663 Type *ValTy = getMemInstValueType(I); 6664 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6665 const Align Alignment = getLoadStoreAlignment(I); 6666 unsigned AS = getLoadStoreAddressSpace(I); 6667 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6668 if (isa<LoadInst>(I)) { 6669 return TTI.getAddressComputationCost(ValTy) + 6670 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6671 CostKind) + 6672 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6673 } 6674 StoreInst *SI = cast<StoreInst>(I); 6675 6676 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6677 return TTI.getAddressComputationCost(ValTy) + 6678 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6679 CostKind) + 6680 (isLoopInvariantStoreValue 6681 ? 0 6682 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6683 VF.getKnownMinValue() - 1)); 6684 } 6685 6686 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6687 ElementCount VF) { 6688 Type *ValTy = getMemInstValueType(I); 6689 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6690 const Align Alignment = getLoadStoreAlignment(I); 6691 const Value *Ptr = getLoadStorePointerOperand(I); 6692 6693 return TTI.getAddressComputationCost(VectorTy) + 6694 TTI.getGatherScatterOpCost( 6695 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6696 TargetTransformInfo::TCK_RecipThroughput, I); 6697 } 6698 6699 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6700 ElementCount VF) { 6701 Type *ValTy = getMemInstValueType(I); 6702 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6703 unsigned AS = getLoadStoreAddressSpace(I); 6704 6705 auto Group = getInterleavedAccessGroup(I); 6706 assert(Group && "Fail to get an interleaved access group."); 6707 6708 unsigned InterleaveFactor = Group->getFactor(); 6709 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6710 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6711 6712 // Holds the indices of existing members in an interleaved load group. 6713 // An interleaved store group doesn't need this as it doesn't allow gaps. 
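  // Illustrative example (hypothetical input loop): for an interleaved load
  // group with factor 3 where only members 0 and 2 exist, e.g.
  //
  //   for (int i = 0; i < n; ++i)
  //     sum += a[3 * i] + a[3 * i + 2];   // member 1 is a gap
  //
  // Indices below becomes {0, 2}, and a gap like this is what can force
  // UseMaskForGaps when no scalar epilogue is allowed.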
6714 SmallVector<unsigned, 4> Indices; 6715 if (isa<LoadInst>(I)) { 6716 for (unsigned i = 0; i < InterleaveFactor; i++) 6717 if (Group->getMember(i)) 6718 Indices.push_back(i); 6719 } 6720 6721 // Calculate the cost of the whole interleaved group. 6722 bool UseMaskForGaps = 6723 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6724 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6725 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6726 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6727 6728 if (Group->isReverse()) { 6729 // TODO: Add support for reversed masked interleaved access. 6730 assert(!Legal->isMaskRequired(I) && 6731 "Reverse masked interleaved access not supported."); 6732 Cost += Group->getNumMembers() * 6733 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6734 } 6735 return Cost; 6736 } 6737 6738 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6739 ElementCount VF) { 6740 // Calculate scalar cost only. Vectorization cost should be ready at this 6741 // moment. 6742 if (VF.isScalar()) { 6743 Type *ValTy = getMemInstValueType(I); 6744 const Align Alignment = getLoadStoreAlignment(I); 6745 unsigned AS = getLoadStoreAddressSpace(I); 6746 6747 return TTI.getAddressComputationCost(ValTy) + 6748 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6749 TTI::TCK_RecipThroughput, I); 6750 } 6751 return getWideningCost(I, VF); 6752 } 6753 6754 LoopVectorizationCostModel::VectorizationCostTy 6755 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6756 ElementCount VF) { 6757 // If we know that this instruction will remain uniform, check the cost of 6758 // the scalar version. 6759 if (isUniformAfterVectorization(I, VF)) 6760 VF = ElementCount::getFixed(1); 6761 6762 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6763 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6764 6765 // Forced scalars do not have any scalarization overhead. 6766 auto ForcedScalar = ForcedScalars.find(VF); 6767 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6768 auto InstSet = ForcedScalar->second; 6769 if (InstSet.count(I)) 6770 return VectorizationCostTy( 6771 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6772 VF.getKnownMinValue()), 6773 false); 6774 } 6775 6776 Type *VectorTy; 6777 unsigned C = getInstructionCost(I, VF, VectorTy); 6778 6779 bool TypeNotScalarized = 6780 VF.isVector() && VectorTy->isVectorTy() && 6781 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6782 return VectorizationCostTy(C, TypeNotScalarized); 6783 } 6784 6785 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6786 ElementCount VF) { 6787 6788 assert(!VF.isScalable() && 6789 "cannot compute scalarization overhead for scalable vectorization"); 6790 if (VF.isScalar()) 6791 return 0; 6792 6793 unsigned Cost = 0; 6794 Type *RetTy = ToVectorTy(I->getType(), VF); 6795 if (!RetTy->isVoidTy() && 6796 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6797 Cost += TTI.getScalarizationOverhead( 6798 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 6799 true, false); 6800 6801 // Some targets keep addresses scalar. 6802 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6803 return Cost; 6804 6805 // Some targets support efficient element stores. 6806 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6807 return Cost; 6808 6809 // Collect operands to consider. 
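// For a call only the argument operands are considered (the callee is not a
// vectorized operand); for every other instruction all operands are. E.g.
// (illustrative) for "%r = call float @llvm.pow.f32(float %a, float %b)"
// only %a and %b may need to be extracted from their vector definitions.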
6810 CallInst *CI = dyn_cast<CallInst>(I); 6811 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6812 6813 // Skip operands that do not require extraction/scalarization and do not incur 6814 // any overhead. 6815 return Cost + TTI.getOperandsScalarizationOverhead( 6816 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 6817 } 6818 6819 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6820 if (VF.isScalar()) 6821 return; 6822 NumPredStores = 0; 6823 for (BasicBlock *BB : TheLoop->blocks()) { 6824 // For each instruction in the old loop. 6825 for (Instruction &I : *BB) { 6826 Value *Ptr = getLoadStorePointerOperand(&I); 6827 if (!Ptr) 6828 continue; 6829 6830 // TODO: We should generate better code and update the cost model for 6831 // predicated uniform stores. Today they are treated as any other 6832 // predicated store (see added test cases in 6833 // invariant-store-vectorization.ll). 6834 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6835 NumPredStores++; 6836 6837 if (Legal->isUniformMemOp(I)) { 6838 // TODO: Avoid replicating loads and stores instead of 6839 // relying on instcombine to remove them. 6840 // Load: Scalar load + broadcast 6841 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6842 unsigned Cost = getUniformMemOpCost(&I, VF); 6843 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6844 continue; 6845 } 6846 6847 // We assume that widening is the best solution when possible. 6848 if (memoryInstructionCanBeWidened(&I, VF)) { 6849 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6850 int ConsecutiveStride = 6851 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6852 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6853 "Expected consecutive stride."); 6854 InstWidening Decision = 6855 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6856 setWideningDecision(&I, VF, Decision, Cost); 6857 continue; 6858 } 6859 6860 // Choose between Interleaving, Gather/Scatter or Scalarization. 6861 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6862 unsigned NumAccesses = 1; 6863 if (isAccessInterleaved(&I)) { 6864 auto Group = getInterleavedAccessGroup(&I); 6865 assert(Group && "Fail to get an interleaved access group."); 6866 6867 // Make one decision for the whole group. 6868 if (getWideningDecision(&I, VF) != CM_Unknown) 6869 continue; 6870 6871 NumAccesses = Group->getNumMembers(); 6872 if (interleavedAccessCanBeWidened(&I, VF)) 6873 InterleaveCost = getInterleaveGroupCost(&I, VF); 6874 } 6875 6876 unsigned GatherScatterCost = 6877 isLegalGatherOrScatter(&I) 6878 ? getGatherScatterCost(&I, VF) * NumAccesses 6879 : std::numeric_limits<unsigned>::max(); 6880 6881 unsigned ScalarizationCost = 6882 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6883 6884 // Choose better solution for the current VF, 6885 // write down this decision and use it during vectorization. 6886 unsigned Cost; 6887 InstWidening Decision; 6888 if (InterleaveCost <= GatherScatterCost && 6889 InterleaveCost < ScalarizationCost) { 6890 Decision = CM_Interleave; 6891 Cost = InterleaveCost; 6892 } else if (GatherScatterCost < ScalarizationCost) { 6893 Decision = CM_GatherScatter; 6894 Cost = GatherScatterCost; 6895 } else { 6896 Decision = CM_Scalarize; 6897 Cost = ScalarizationCost; 6898 } 6899 // If the instructions belongs to an interleave group, the whole group 6900 // receives the same decision. 
The whole group receives the cost, but 6901 // the cost will actually be assigned to one instruction. 6902 if (auto Group = getInterleavedAccessGroup(&I)) 6903 setWideningDecision(Group, VF, Decision, Cost); 6904 else 6905 setWideningDecision(&I, VF, Decision, Cost); 6906 } 6907 } 6908 6909 // Make sure that any load of address and any other address computation 6910 // remains scalar unless there is gather/scatter support. This avoids 6911 // inevitable extracts into address registers, and also has the benefit of 6912 // activating LSR more, since that pass can't optimize vectorized 6913 // addresses. 6914 if (TTI.prefersVectorizedAddressing()) 6915 return; 6916 6917 // Start with all scalar pointer uses. 6918 SmallPtrSet<Instruction *, 8> AddrDefs; 6919 for (BasicBlock *BB : TheLoop->blocks()) 6920 for (Instruction &I : *BB) { 6921 Instruction *PtrDef = 6922 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6923 if (PtrDef && TheLoop->contains(PtrDef) && 6924 getWideningDecision(&I, VF) != CM_GatherScatter) 6925 AddrDefs.insert(PtrDef); 6926 } 6927 6928 // Add all instructions used to generate the addresses. 6929 SmallVector<Instruction *, 4> Worklist; 6930 for (auto *I : AddrDefs) 6931 Worklist.push_back(I); 6932 while (!Worklist.empty()) { 6933 Instruction *I = Worklist.pop_back_val(); 6934 for (auto &Op : I->operands()) 6935 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6936 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6937 AddrDefs.insert(InstOp).second) 6938 Worklist.push_back(InstOp); 6939 } 6940 6941 for (auto *I : AddrDefs) { 6942 if (isa<LoadInst>(I)) { 6943 // Setting the desired widening decision should ideally be handled in 6944 // by cost functions, but since this involves the task of finding out 6945 // if the loaded register is involved in an address computation, it is 6946 // instead changed here when we know this is the case. 6947 InstWidening Decision = getWideningDecision(I, VF); 6948 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6949 // Scalarize a widened load of address. 6950 setWideningDecision( 6951 I, VF, CM_Scalarize, 6952 (VF.getKnownMinValue() * 6953 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6954 else if (auto Group = getInterleavedAccessGroup(I)) { 6955 // Scalarize an interleave group of address loads. 6956 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6957 if (Instruction *Member = Group->getMember(I)) 6958 setWideningDecision( 6959 Member, VF, CM_Scalarize, 6960 (VF.getKnownMinValue() * 6961 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6962 } 6963 } 6964 } else 6965 // Make sure I gets scalarized and a cost estimate without 6966 // scalarization overhead. 6967 ForcedScalars[VF].insert(I); 6968 } 6969 } 6970 6971 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6972 ElementCount VF, 6973 Type *&VectorTy) { 6974 Type *RetTy = I->getType(); 6975 if (canTruncateToMinimalBitwidth(I, VF)) 6976 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6977 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6978 auto SE = PSE.getSE(); 6979 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6980 6981 // TODO: We need to estimate the cost of intrinsic calls. 6982 switch (I->getOpcode()) { 6983 case Instruction::GetElementPtr: 6984 // We mark this instruction as zero-cost because the cost of GEPs in 6985 // vectorized code depends on whether the corresponding memory instruction 6986 // is scalarized or not. 
Therefore, we handle GEPs with the memory 6987 // instruction cost. 6988 return 0; 6989 case Instruction::Br: { 6990 // In cases of scalarized and predicated instructions, there will be VF 6991 // predicated blocks in the vectorized loop. Each branch around these 6992 // blocks requires also an extract of its vector compare i1 element. 6993 bool ScalarPredicatedBB = false; 6994 BranchInst *BI = cast<BranchInst>(I); 6995 if (VF.isVector() && BI->isConditional() && 6996 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6997 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6998 ScalarPredicatedBB = true; 6999 7000 if (ScalarPredicatedBB) { 7001 // Return cost for branches around scalarized and predicated blocks. 7002 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7003 auto *Vec_i1Ty = 7004 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7005 return (TTI.getScalarizationOverhead( 7006 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7007 false, true) + 7008 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7009 VF.getKnownMinValue())); 7010 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7011 // The back-edge branch will remain, as will all scalar branches. 7012 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7013 else 7014 // This branch will be eliminated by if-conversion. 7015 return 0; 7016 // Note: We currently assume zero cost for an unconditional branch inside 7017 // a predicated block since it will become a fall-through, although we 7018 // may decide in the future to call TTI for all branches. 7019 } 7020 case Instruction::PHI: { 7021 auto *Phi = cast<PHINode>(I); 7022 7023 // First-order recurrences are replaced by vector shuffles inside the loop. 7024 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7025 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7026 return TTI.getShuffleCost( 7027 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7028 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7029 7030 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7031 // converted into select instructions. We require N - 1 selects per phi 7032 // node, where N is the number of incoming values. 7033 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7034 return (Phi->getNumIncomingValues() - 1) * 7035 TTI.getCmpSelInstrCost( 7036 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7037 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7038 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7039 7040 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7041 } 7042 case Instruction::UDiv: 7043 case Instruction::SDiv: 7044 case Instruction::URem: 7045 case Instruction::SRem: 7046 // If we have a predicated instruction, it may not be executed for each 7047 // vector lane. Get the scalarization cost and scale this amount by the 7048 // probability of executing the predicated block. If the instruction is not 7049 // predicated, we fall through to the next case. 7050 if (VF.isVector() && isScalarWithPredication(I)) { 7051 unsigned Cost = 0; 7052 7053 // These instructions have a non-void type, so account for the phi nodes 7054 // that we will create. This cost is likely to be zero. The phi node 7055 // cost, if any, should be scaled by the block probability because it 7056 // models a copy at the end of each predicated block. 
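// Illustrative arithmetic (the individual costs are assumptions, not target
// data): with VF = 4, a PHI cost of 0, a scalar SDIV cost of 20 and a
// scalarization overhead of 8, the code below computes
//   (4 * 0 + 4 * 20 + 8) / 2 = 44
// since getReciprocalPredBlockProb() currently returns 2.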
7057 Cost += VF.getKnownMinValue() * 7058 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7059 7060 // The cost of the non-predicated instruction. 7061 Cost += VF.getKnownMinValue() * 7062 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7063 7064 // The cost of insertelement and extractelement instructions needed for 7065 // scalarization. 7066 Cost += getScalarizationOverhead(I, VF); 7067 7068 // Scale the cost by the probability of executing the predicated blocks. 7069 // This assumes the predicated block for each vector lane is equally 7070 // likely. 7071 return Cost / getReciprocalPredBlockProb(); 7072 } 7073 LLVM_FALLTHROUGH; 7074 case Instruction::Add: 7075 case Instruction::FAdd: 7076 case Instruction::Sub: 7077 case Instruction::FSub: 7078 case Instruction::Mul: 7079 case Instruction::FMul: 7080 case Instruction::FDiv: 7081 case Instruction::FRem: 7082 case Instruction::Shl: 7083 case Instruction::LShr: 7084 case Instruction::AShr: 7085 case Instruction::And: 7086 case Instruction::Or: 7087 case Instruction::Xor: { 7088 // Since we will replace the stride by 1 the multiplication should go away. 7089 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7090 return 0; 7091 // Certain instructions can be cheaper to vectorize if they have a constant 7092 // second vector operand. One example of this are shifts on x86. 7093 Value *Op2 = I->getOperand(1); 7094 TargetTransformInfo::OperandValueProperties Op2VP; 7095 TargetTransformInfo::OperandValueKind Op2VK = 7096 TTI.getOperandInfo(Op2, Op2VP); 7097 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7098 Op2VK = TargetTransformInfo::OK_UniformValue; 7099 7100 SmallVector<const Value *, 4> Operands(I->operand_values()); 7101 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7102 return N * TTI.getArithmeticInstrCost( 7103 I->getOpcode(), VectorTy, CostKind, 7104 TargetTransformInfo::OK_AnyValue, 7105 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7106 } 7107 case Instruction::FNeg: { 7108 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7109 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7110 return N * TTI.getArithmeticInstrCost( 7111 I->getOpcode(), VectorTy, CostKind, 7112 TargetTransformInfo::OK_AnyValue, 7113 TargetTransformInfo::OK_AnyValue, 7114 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7115 I->getOperand(0), I); 7116 } 7117 case Instruction::Select: { 7118 SelectInst *SI = cast<SelectInst>(I); 7119 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7120 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7121 Type *CondTy = SI->getCondition()->getType(); 7122 if (!ScalarCond) { 7123 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7124 CondTy = VectorType::get(CondTy, VF); 7125 } 7126 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7127 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7128 } 7129 case Instruction::ICmp: 7130 case Instruction::FCmp: { 7131 Type *ValTy = I->getOperand(0)->getType(); 7132 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7133 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7134 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7135 VectorTy = ToVectorTy(ValTy, VF); 7136 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7137 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7138 } 7139 case Instruction::Store: 7140 case Instruction::Load: { 7141 ElementCount Width = VF; 7142 if (Width.isVector()) { 7143 InstWidening Decision = getWideningDecision(I, Width); 7144 assert(Decision != CM_Unknown && 7145 "CM decision should be taken at this point"); 7146 if (Decision == CM_Scalarize) 7147 Width = ElementCount::getFixed(1); 7148 } 7149 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7150 return getMemoryInstructionCost(I, VF); 7151 } 7152 case Instruction::ZExt: 7153 case Instruction::SExt: 7154 case Instruction::FPToUI: 7155 case Instruction::FPToSI: 7156 case Instruction::FPExt: 7157 case Instruction::PtrToInt: 7158 case Instruction::IntToPtr: 7159 case Instruction::SIToFP: 7160 case Instruction::UIToFP: 7161 case Instruction::Trunc: 7162 case Instruction::FPTrunc: 7163 case Instruction::BitCast: { 7164 // Computes the CastContextHint from a Load/Store instruction. 7165 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7166 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7167 "Expected a load or a store!"); 7168 7169 if (VF.isScalar() || !TheLoop->contains(I)) 7170 return TTI::CastContextHint::Normal; 7171 7172 switch (getWideningDecision(I, VF)) { 7173 case LoopVectorizationCostModel::CM_GatherScatter: 7174 return TTI::CastContextHint::GatherScatter; 7175 case LoopVectorizationCostModel::CM_Interleave: 7176 return TTI::CastContextHint::Interleave; 7177 case LoopVectorizationCostModel::CM_Scalarize: 7178 case LoopVectorizationCostModel::CM_Widen: 7179 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7180 : TTI::CastContextHint::Normal; 7181 case LoopVectorizationCostModel::CM_Widen_Reverse: 7182 return TTI::CastContextHint::Reversed; 7183 case LoopVectorizationCostModel::CM_Unknown: 7184 llvm_unreachable("Instr did not go through cost modelling?"); 7185 } 7186 7187 llvm_unreachable("Unhandled case!"); 7188 }; 7189 7190 unsigned Opcode = I->getOpcode(); 7191 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7192 // For Trunc, the context is the only user, which must be a StoreInst. 
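// Illustrative IR for the Trunc case:
//   %t = trunc i32 %x to i8
//   store i8 %t, i8* %p
// Here the hint is derived from the widening decision taken for the store,
// whereas a ZExt/SExt/FPExt derives it from its load operand below.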
7193 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7194 if (I->hasOneUse()) 7195 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7196 CCH = ComputeCCH(Store); 7197 } 7198 // For Z/Sext, the context is the operand, which must be a LoadInst. 7199 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7200 Opcode == Instruction::FPExt) { 7201 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7202 CCH = ComputeCCH(Load); 7203 } 7204 7205 // We optimize the truncation of induction variables having constant 7206 // integer steps. The cost of these truncations is the same as the scalar 7207 // operation. 7208 if (isOptimizableIVTruncate(I, VF)) { 7209 auto *Trunc = cast<TruncInst>(I); 7210 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7211 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7212 } 7213 7214 Type *SrcScalarTy = I->getOperand(0)->getType(); 7215 Type *SrcVecTy = 7216 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7217 if (canTruncateToMinimalBitwidth(I, VF)) { 7218 // This cast is going to be shrunk. This may remove the cast or it might 7219 // turn it into slightly different cast. For example, if MinBW == 16, 7220 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7221 // 7222 // Calculate the modified src and dest types. 7223 Type *MinVecTy = VectorTy; 7224 if (Opcode == Instruction::Trunc) { 7225 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7226 VectorTy = 7227 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7228 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7229 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7230 VectorTy = 7231 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7232 } 7233 } 7234 7235 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7236 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7237 return N * 7238 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7239 } 7240 case Instruction::Call: { 7241 bool NeedToScalarize; 7242 CallInst *CI = cast<CallInst>(I); 7243 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7244 if (getVectorIntrinsicIDForCall(CI, TLI)) 7245 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 7246 return CallCost; 7247 } 7248 case Instruction::ExtractValue: { 7249 InstructionCost ExtractCost = 7250 TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7251 assert(ExtractCost.isValid() && "Invalid cost for ExtractValue"); 7252 return *(ExtractCost.getValue()); 7253 } 7254 default: 7255 // The cost of executing VF copies of the scalar instruction. This opcode 7256 // is unknown. Assume that it is the same as 'mul'. 7257 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7258 Instruction::Mul, VectorTy, CostKind) + 7259 getScalarizationOverhead(I, VF); 7260 } // end of switch. 
7261 } 7262 7263 char LoopVectorize::ID = 0; 7264 7265 static const char lv_name[] = "Loop Vectorization"; 7266 7267 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7268 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7269 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7270 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7271 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7272 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7273 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7274 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7275 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7276 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7277 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7278 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7279 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7280 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7281 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7282 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7283 7284 namespace llvm { 7285 7286 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7287 7288 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7289 bool VectorizeOnlyWhenForced) { 7290 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7291 } 7292 7293 } // end namespace llvm 7294 7295 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7296 // Check if the pointer operand of a load or store instruction is 7297 // consecutive. 7298 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7299 return Legal->isConsecutivePtr(Ptr); 7300 return false; 7301 } 7302 7303 void LoopVectorizationCostModel::collectValuesToIgnore() { 7304 // Ignore ephemeral values. 7305 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7306 7307 // Ignore type-promoting instructions we identified during reduction 7308 // detection. 7309 for (auto &Reduction : Legal->getReductionVars()) { 7310 RecurrenceDescriptor &RedDes = Reduction.second; 7311 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7312 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7313 } 7314 // Ignore type-casting instructions we identified during induction 7315 // detection. 7316 for (auto &Induction : Legal->getInductionVars()) { 7317 InductionDescriptor &IndDes = Induction.second; 7318 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7319 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7320 } 7321 } 7322 7323 void LoopVectorizationCostModel::collectInLoopReductions() { 7324 for (auto &Reduction : Legal->getReductionVars()) { 7325 PHINode *Phi = Reduction.first; 7326 RecurrenceDescriptor &RdxDesc = Reduction.second; 7327 7328 // We don't collect reductions that are type promoted (yet). 7329 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7330 continue; 7331 7332 // If the target would prefer this reduction to happen "in-loop", then we 7333 // want to record it as such. 7334 unsigned Opcode = RdxDesc.getRecurrenceBinOp(); 7335 if (!PreferInLoopReductions && 7336 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7337 TargetTransformInfo::ReductionFlags())) 7338 continue; 7339 7340 // Check that we can correctly put the reductions into the loop, by 7341 // finding the chain of operations that leads from the phi to the loop 7342 // exit value. 
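// For instance (illustrative), for a simple integer sum
//   %sum = phi i32 [ 0, %preheader ], [ %sum.next, %loop ]
//   %sum.next = add i32 %sum, %val
// the chain is {%sum.next}; if no usable chain is found the reduction is
// left out-of-loop and reduced after the vector loop as before.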
7343 SmallVector<Instruction *, 4> ReductionOperations = 7344 RdxDesc.getReductionOpChain(Phi, TheLoop); 7345 bool InLoop = !ReductionOperations.empty(); 7346 if (InLoop) 7347 InLoopReductionChains[Phi] = ReductionOperations; 7348 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7349 << " reduction for phi: " << *Phi << "\n"); 7350 } 7351 } 7352 7353 // TODO: we could return a pair of values that specify the max VF and 7354 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7355 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7356 // doesn't have a cost model that can choose which plan to execute if 7357 // more than one is generated. 7358 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7359 LoopVectorizationCostModel &CM) { 7360 unsigned WidestType; 7361 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7362 return WidestVectorRegBits / WidestType; 7363 } 7364 7365 VectorizationFactor 7366 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7367 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7368 ElementCount VF = UserVF; 7369 // Outer loop handling: They may require CFG and instruction level 7370 // transformations before even evaluating whether vectorization is profitable. 7371 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7372 // the vectorization pipeline. 7373 if (!OrigLoop->isInnermost()) { 7374 // If the user doesn't provide a vectorization factor, determine a 7375 // reasonable one. 7376 if (UserVF.isZero()) { 7377 VF = ElementCount::getFixed( 7378 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 7379 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7380 7381 // Make sure we have a VF > 1 for stress testing. 7382 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7383 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7384 << "overriding computed VF.\n"); 7385 VF = ElementCount::getFixed(4); 7386 } 7387 } 7388 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7389 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7390 "VF needs to be a power of two"); 7391 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7392 << "VF " << VF << " to build VPlans.\n"); 7393 buildVPlans(VF, VF); 7394 7395 // For VPlan build stress testing, we bail out after VPlan construction. 7396 if (VPlanBuildStressTest) 7397 return VectorizationFactor::Disabled(); 7398 7399 return {VF, 0 /*Cost*/}; 7400 } 7401 7402 LLVM_DEBUG( 7403 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7404 "VPlan-native path.\n"); 7405 return VectorizationFactor::Disabled(); 7406 } 7407 7408 Optional<VectorizationFactor> 7409 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7410 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7411 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7412 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 7413 return None; 7414 7415 // Invalidate interleave groups if all blocks of loop will be predicated. 
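// Note: predication of the header here corresponds to folding the tail by
// masking, i.e. every block of the loop body executes under a mask, so
// interleave groups can only survive if the target supports masked
// interleaved accesses.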
7416 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7417 !useMaskedInterleavedAccesses(*TTI)) { 7418 LLVM_DEBUG( 7419 dbgs() 7420 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7421 "which requires masked-interleaved support.\n"); 7422 if (CM.InterleaveInfo.invalidateGroups()) 7423 // Invalidating interleave groups also requires invalidating all decisions 7424 // based on them, which includes widening decisions and uniform and scalar 7425 // values. 7426 CM.invalidateCostModelingDecisions(); 7427 } 7428 7429 ElementCount MaxVF = MaybeMaxVF.getValue(); 7430 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7431 7432 if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) { 7433 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7434 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7435 "VF needs to be a power of two"); 7436 // Collect the instructions (and their associated costs) that will be more 7437 // profitable to scalarize. 7438 CM.selectUserVectorizationFactor(UserVF); 7439 CM.collectInLoopReductions(); 7440 buildVPlansWithVPRecipes(UserVF, UserVF); 7441 LLVM_DEBUG(printPlans(dbgs())); 7442 return {{UserVF, 0}}; 7443 } 7444 7445 assert(!MaxVF.isScalable() && 7446 "Scalable vectors not yet supported beyond this point"); 7447 7448 for (ElementCount VF = ElementCount::getFixed(1); 7449 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7450 // Collect Uniform and Scalar instructions after vectorization with VF. 7451 CM.collectUniformsAndScalars(VF); 7452 7453 // Collect the instructions (and their associated costs) that will be more 7454 // profitable to scalarize. 7455 if (VF.isVector()) 7456 CM.collectInstsToScalarize(VF); 7457 } 7458 7459 CM.collectInLoopReductions(); 7460 7461 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7462 LLVM_DEBUG(printPlans(dbgs())); 7463 if (MaxVF.isScalar()) 7464 return VectorizationFactor::Disabled(); 7465 7466 // Select the optimal vectorization factor. 7467 return CM.selectVectorizationFactor(MaxVF); 7468 } 7469 7470 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7471 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7472 << '\n'); 7473 BestVF = VF; 7474 BestUF = UF; 7475 7476 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7477 return !Plan->hasVF(VF); 7478 }); 7479 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7480 } 7481 7482 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7483 DominatorTree *DT) { 7484 // Perform the actual loop transformation. 7485 7486 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7487 VPCallbackILV CallbackILV(ILV); 7488 7489 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7490 7491 VPTransformState State{*BestVF, BestUF, LI, 7492 DT, ILV.Builder, ILV.VectorLoopValueMap, 7493 &ILV, CallbackILV}; 7494 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7495 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7496 State.CanonicalIV = ILV.Induction; 7497 7498 ILV.printDebugTracesAtStart(); 7499 7500 //===------------------------------------------------===// 7501 // 7502 // Notice: any optimization or new instruction that go 7503 // into the code below should also be implemented in 7504 // the cost-model. 7505 // 7506 //===------------------------------------------------===// 7507 7508 // 2. Copy and widen instructions from the old loop into the new loop. 
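// setBestPlan() has already erased every candidate plan except the one that
// contains BestVF, so exactly one VPlan is expected to remain at this point.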
7509 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7510 VPlans.front()->execute(&State); 7511 7512 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7513 // predication, updating analyses. 7514 ILV.fixVectorizedLoop(); 7515 7516 ILV.printDebugTracesAtEnd(); 7517 } 7518 7519 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7520 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7521 7522 // We create new control-flow for the vectorized loop, so the original exit 7523 // conditions will be dead after vectorization if it's only used by the 7524 // terminator 7525 SmallVector<BasicBlock*> ExitingBlocks; 7526 OrigLoop->getExitingBlocks(ExitingBlocks); 7527 for (auto *BB : ExitingBlocks) { 7528 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7529 if (!Cmp || !Cmp->hasOneUse()) 7530 continue; 7531 7532 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7533 if (!DeadInstructions.insert(Cmp).second) 7534 continue; 7535 7536 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7537 // TODO: can recurse through operands in general 7538 for (Value *Op : Cmp->operands()) { 7539 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7540 DeadInstructions.insert(cast<Instruction>(Op)); 7541 } 7542 } 7543 7544 // We create new "steps" for induction variable updates to which the original 7545 // induction variables map. An original update instruction will be dead if 7546 // all its users except the induction variable are dead. 7547 auto *Latch = OrigLoop->getLoopLatch(); 7548 for (auto &Induction : Legal->getInductionVars()) { 7549 PHINode *Ind = Induction.first; 7550 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7551 7552 // If the tail is to be folded by masking, the primary induction variable, 7553 // if exists, isn't dead: it will be used for masking. Don't kill it. 7554 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7555 continue; 7556 7557 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7558 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7559 })) 7560 DeadInstructions.insert(IndUpdate); 7561 7562 // We record as "Dead" also the type-casting instructions we had identified 7563 // during induction analysis. We don't need any handling for them in the 7564 // vectorized loop because we have proven that, under a proper runtime 7565 // test guarding the vectorized loop, the value of the phi, and the casted 7566 // value of the phi, are the same. The last instruction in this casting chain 7567 // will get its scalar/vector/widened def from the scalar/vector/widened def 7568 // of the respective phi node. Any other casts in the induction def-use chain 7569 // have no other uses outside the phi update chain, and will be ignored. 7570 InductionDescriptor &IndDes = Induction.second; 7571 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7572 DeadInstructions.insert(Casts.begin(), Casts.end()); 7573 } 7574 } 7575 7576 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7577 7578 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7579 7580 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7581 Instruction::BinaryOps BinOp) { 7582 // When unrolling and the VF is 1, we only need to add a simple scalar. 
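// E.g. (illustrative) when interleaving with UF = 4 and VF = 1, the unrolled
// copy with StartIdx = 2 simply computes Val + 2 * Step (or the equivalent
// floating-point operation via BinOp below).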
7583 Type *Ty = Val->getType(); 7584 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7585 7586 if (Ty->isFloatingPointTy()) { 7587 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7588 7589 // Floating point operations had to be 'fast' to enable the unrolling. 7590 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7591 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7592 } 7593 Constant *C = ConstantInt::get(Ty, StartIdx); 7594 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7595 } 7596 7597 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7598 SmallVector<Metadata *, 4> MDs; 7599 // Reserve first location for self reference to the LoopID metadata node. 7600 MDs.push_back(nullptr); 7601 bool IsUnrollMetadata = false; 7602 MDNode *LoopID = L->getLoopID(); 7603 if (LoopID) { 7604 // First find existing loop unrolling disable metadata. 7605 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7606 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7607 if (MD) { 7608 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7609 IsUnrollMetadata = 7610 S && S->getString().startswith("llvm.loop.unroll.disable"); 7611 } 7612 MDs.push_back(LoopID->getOperand(i)); 7613 } 7614 } 7615 7616 if (!IsUnrollMetadata) { 7617 // Add runtime unroll disable metadata. 7618 LLVMContext &Context = L->getHeader()->getContext(); 7619 SmallVector<Metadata *, 1> DisableOperands; 7620 DisableOperands.push_back( 7621 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7622 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7623 MDs.push_back(DisableNode); 7624 MDNode *NewLoopID = MDNode::get(Context, MDs); 7625 // Set operand 0 to refer to the loop id itself. 7626 NewLoopID->replaceOperandWith(0, NewLoopID); 7627 L->setLoopID(NewLoopID); 7628 } 7629 } 7630 7631 //===--------------------------------------------------------------------===// 7632 // EpilogueVectorizerMainLoop 7633 //===--------------------------------------------------------------------===// 7634 7635 /// This function is partially responsible for generating the control flow 7636 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7637 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7638 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7639 Loop *Lp = createVectorLoopSkeleton(""); 7640 7641 // Generate the code to check the minimum iteration count of the vector 7642 // epilogue (see below). 7643 EPI.EpilogueIterationCountCheck = 7644 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7645 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7646 7647 // Generate the code to check any assumptions that we've made for SCEV 7648 // expressions. 7649 BasicBlock *SavedPreHeader = LoopVectorPreHeader; 7650 emitSCEVChecks(Lp, LoopScalarPreHeader); 7651 7652 // If a safety check was generated save it. 7653 if (SavedPreHeader != LoopVectorPreHeader) 7654 EPI.SCEVSafetyCheck = SavedPreHeader; 7655 7656 // Generate the code that checks at runtime if arrays overlap. We put the 7657 // checks into a separate block to make the more common case of few elements 7658 // faster. 7659 SavedPreHeader = LoopVectorPreHeader; 7660 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 7661 7662 // If a safety check was generated save/overwite it. 
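// The check below relies on emitMemRuntimeChecks() installing a fresh vector
// preheader when it emits checks: a changed LoopVectorPreHeader means the
// saved block now holds the memory-overlap check.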
7663 if (SavedPreHeader != LoopVectorPreHeader) 7664 EPI.MemSafetyCheck = SavedPreHeader; 7665 7666 // Generate the iteration count check for the main loop, *after* the check 7667 // for the epilogue loop, so that the path-length is shorter for the case 7668 // that goes directly through the vector epilogue. The longer-path length for 7669 // the main loop is compensated for, by the gain from vectorizing the larger 7670 // trip count. Note: the branch will get updated later on when we vectorize 7671 // the epilogue. 7672 EPI.MainLoopIterationCountCheck = 7673 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7674 7675 // Generate the induction variable. 7676 OldInduction = Legal->getPrimaryInduction(); 7677 Type *IdxTy = Legal->getWidestInductionType(); 7678 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7679 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7680 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7681 EPI.VectorTripCount = CountRoundDown; 7682 Induction = 7683 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7684 getDebugLocFromInstOrOperands(OldInduction)); 7685 7686 // Skip induction resume value creation here because they will be created in 7687 // the second pass. If we created them here, they wouldn't be used anyway, 7688 // because the vplan in the second pass still contains the inductions from the 7689 // original loop. 7690 7691 return completeLoopSkeleton(Lp, OrigLoopID); 7692 } 7693 7694 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7695 LLVM_DEBUG({ 7696 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7697 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7698 << ", Main Loop UF:" << EPI.MainLoopUF 7699 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7700 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7701 }); 7702 } 7703 7704 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7705 DEBUG_WITH_TYPE(VerboseDebug, { 7706 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 7707 }); 7708 } 7709 7710 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 7711 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 7712 assert(L && "Expected valid Loop."); 7713 assert(Bypass && "Expected valid bypass basic block."); 7714 unsigned VFactor = 7715 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 7716 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7717 Value *Count = getOrCreateTripCount(L); 7718 // Reuse existing vector loop preheader for TC checks. 7719 // Note that new preheader block is generated for vector loop. 7720 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7721 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7722 7723 // Generate code to check if the loop's trip count is less than VF * UF of the 7724 // main vector loop. 7725 auto P = 7726 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7727 7728 Value *CheckMinIters = Builder.CreateICmp( 7729 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 7730 "min.iters.check"); 7731 7732 if (!ForEpilogue) 7733 TCCheckBlock->setName("vector.main.loop.iter.check"); 7734 7735 // Create new preheader for vector loop. 
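// Illustrative example: with a main loop VF of 8 and UF of 2 the condition
// computed above is "Count < 16" (or "<= 16" when a scalar epilogue is
// required); the branch created further down takes the bypass when it holds,
// skipping this vector loop.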
7736 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7737 DT, LI, nullptr, "vector.ph"); 7738 7739 if (ForEpilogue) { 7740 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7741 DT->getNode(Bypass)->getIDom()) && 7742 "TC check is expected to dominate Bypass"); 7743 7744 // Update dominator for Bypass & LoopExit. 7745 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7746 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7747 7748 LoopBypassBlocks.push_back(TCCheckBlock); 7749 7750 // Save the trip count so we don't have to regenerate it in the 7751 // vec.epilog.iter.check. This is safe to do because the trip count 7752 // generated here dominates the vector epilog iter check. 7753 EPI.TripCount = Count; 7754 } 7755 7756 ReplaceInstWithInst( 7757 TCCheckBlock->getTerminator(), 7758 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7759 7760 return TCCheckBlock; 7761 } 7762 7763 //===--------------------------------------------------------------------===// 7764 // EpilogueVectorizerEpilogueLoop 7765 //===--------------------------------------------------------------------===// 7766 7767 /// This function is partially responsible for generating the control flow 7768 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7769 BasicBlock * 7770 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7771 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7772 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 7773 7774 // Now, compare the remaining count and if there aren't enough iterations to 7775 // execute the vectorized epilogue skip to the scalar part. 7776 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7777 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7778 LoopVectorPreHeader = 7779 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7780 LI, nullptr, "vec.epilog.ph"); 7781 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 7782 VecEpilogueIterationCountCheck); 7783 7784 // Adjust the control flow taking the state info from the main loop 7785 // vectorization into account. 7786 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7787 "expected this to be saved from the previous pass."); 7788 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7789 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7790 7791 DT->changeImmediateDominator(LoopVectorPreHeader, 7792 EPI.MainLoopIterationCountCheck); 7793 7794 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7795 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7796 7797 if (EPI.SCEVSafetyCheck) 7798 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7799 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7800 if (EPI.MemSafetyCheck) 7801 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7802 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7803 7804 DT->changeImmediateDominator( 7805 VecEpilogueIterationCountCheck, 7806 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7807 7808 DT->changeImmediateDominator(LoopScalarPreHeader, 7809 EPI.EpilogueIterationCountCheck); 7810 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 7811 7812 // Keep track of bypass blocks, as they feed start values to the induction 7813 // phis in the scalar loop preheader. 
7814 if (EPI.SCEVSafetyCheck) 7815 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7816 if (EPI.MemSafetyCheck) 7817 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7818 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7819 7820 // Generate a resume induction for the vector epilogue and put it in the 7821 // vector epilogue preheader. 7822 Type *IdxTy = Legal->getWidestInductionType(); 7823 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 7824 LoopVectorPreHeader->getFirstNonPHI()); 7825 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 7826 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 7827 EPI.MainLoopIterationCountCheck); 7828 7829 // Generate the induction variable. 7830 OldInduction = Legal->getPrimaryInduction(); 7831 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7832 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7833 Value *StartIdx = EPResumeVal; 7834 Induction = 7835 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7836 getDebugLocFromInstOrOperands(OldInduction)); 7837 7838 // Generate induction resume values. These variables save the new starting 7839 // indexes for the scalar loop. They are used to test if there are any tail 7840 // iterations left once the vector loop has completed. 7841 // Note that when the vectorized epilogue is skipped due to the iteration count 7842 // check, then the resume value for the induction variable comes from 7843 // the trip count of the main vector loop, hence passing the AdditionalBypass 7844 // argument. 7845 createInductionResumeValues(Lp, CountRoundDown, 7846 {VecEpilogueIterationCountCheck, 7847 EPI.VectorTripCount} /* AdditionalBypass */); 7848 7849 AddRuntimeUnrollDisableMetaData(Lp); 7850 return completeLoopSkeleton(Lp, OrigLoopID); 7851 } 7852 7853 BasicBlock * 7854 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 7855 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 7856 7857 assert(EPI.TripCount && 7858 "Expected trip count to have been saved in the first pass."); 7859 assert( 7860 (!isa<Instruction>(EPI.TripCount) || 7861 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 7862 "saved trip count does not dominate insertion point."); 7863 Value *TC = EPI.TripCount; 7864 IRBuilder<> Builder(Insert->getTerminator()); 7865 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 7866 7867 // Generate code to check if the loop's trip count is less than VF * UF of the 7868 // vector epilogue loop. 7869 auto P = 7870 Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7871 7872 Value *CheckMinIters = Builder.CreateICmp( 7873 P, Count, 7874 ConstantInt::get(Count->getType(), 7875 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 7876 "min.epilog.iters.check"); 7877 7878 ReplaceInstWithInst( 7879 Insert->getTerminator(), 7880 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7881 7882 LoopBypassBlocks.push_back(Insert); 7883 return Insert; 7884 } 7885 7886 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7887 LLVM_DEBUG({ 7888 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7889 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7890 << ", Main Loop UF:" << EPI.MainLoopUF 7891 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7892 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7893 }); 7894 } 7895 7896 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7897 DEBUG_WITH_TYPE(VerboseDebug, { 7898 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 7899 }); 7900 } 7901 7902 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7903 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7904 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7905 bool PredicateAtRangeStart = Predicate(Range.Start); 7906 7907 for (ElementCount TmpVF = Range.Start * 2; 7908 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 7909 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7910 Range.End = TmpVF; 7911 break; 7912 } 7913 7914 return PredicateAtRangeStart; 7915 } 7916 7917 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7918 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7919 /// of VF's starting at a given VF and extending it as much as possible. Each 7920 /// vectorization decision can potentially shorten this sub-range during 7921 /// buildVPlan(). 7922 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7923 ElementCount MaxVF) { 7924 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 7925 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 7926 VFRange SubRange = {VF, MaxVFPlusOne}; 7927 VPlans.push_back(buildVPlan(SubRange)); 7928 VF = SubRange.End; 7929 } 7930 } 7931 7932 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7933 VPlanPtr &Plan) { 7934 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7935 7936 // Look for cached value. 7937 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7938 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7939 if (ECEntryIt != EdgeMaskCache.end()) 7940 return ECEntryIt->second; 7941 7942 VPValue *SrcMask = createBlockInMask(Src, Plan); 7943 7944 // The terminator has to be a branch inst! 7945 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7946 assert(BI && "Unexpected terminator found"); 7947 7948 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7949 return EdgeMaskCache[Edge] = SrcMask; 7950 7951 // If source is an exiting block, we know the exit edge is dynamically dead 7952 // in the vector loop, and thus we don't need to restrict the mask. Avoid 7953 // adding uses of an otherwise potentially dead instruction. 
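// For the remaining (non-exiting) case handled below, the edge mask is the
// branch condition combined with the source block mask. Illustrative
// example: for "br i1 %c, label %then, label %else" with block-in mask %m,
// the edge to %then gets (%c & %m) and the edge to %else gets (!%c & %m);
// with an all-one (null) source mask it is just %c or !%c.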
7954 if (OrigLoop->isLoopExiting(Src)) 7955 return EdgeMaskCache[Edge] = SrcMask; 7956 7957 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 7958 assert(EdgeMask && "No Edge Mask found for condition"); 7959 7960 if (BI->getSuccessor(0) != Dst) 7961 EdgeMask = Builder.createNot(EdgeMask); 7962 7963 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 7964 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 7965 7966 return EdgeMaskCache[Edge] = EdgeMask; 7967 } 7968 7969 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7970 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7971 7972 // Look for cached value. 7973 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7974 if (BCEntryIt != BlockMaskCache.end()) 7975 return BCEntryIt->second; 7976 7977 // All-one mask is modelled as no-mask following the convention for masked 7978 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7979 VPValue *BlockMask = nullptr; 7980 7981 if (OrigLoop->getHeader() == BB) { 7982 if (!CM.blockNeedsPredication(BB)) 7983 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 7984 7985 // Create the block in mask as the first non-phi instruction in the block. 7986 VPBuilder::InsertPointGuard Guard(Builder); 7987 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 7988 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 7989 7990 // Introduce the early-exit compare IV <= BTC to form header block mask. 7991 // This is used instead of IV < TC because TC may wrap, unlike BTC. 7992 // Start by constructing the desired canonical IV. 7993 VPValue *IV = nullptr; 7994 if (Legal->getPrimaryInduction()) 7995 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 7996 else { 7997 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 7998 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 7999 IV = IVRecipe->getVPValue(); 8000 } 8001 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8002 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8003 8004 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8005 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8006 // as a second argument, we only pass the IV here and extract the 8007 // tripcount from the transform state where codegen of the VP instructions 8008 // happen. 8009 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8010 } else { 8011 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8012 } 8013 return BlockMaskCache[BB] = BlockMask; 8014 } 8015 8016 // This is the block mask. We OR all incoming edges. 8017 for (auto *Predecessor : predecessors(BB)) { 8018 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8019 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8020 return BlockMaskCache[BB] = EdgeMask; 8021 8022 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8023 BlockMask = EdgeMask; 8024 continue; 8025 } 8026 8027 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8028 } 8029 8030 return BlockMaskCache[BB] = BlockMask; 8031 } 8032 8033 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8034 VPlanPtr &Plan) { 8035 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8036 "Must be called with either a load or store"); 8037 8038 auto willWiden = [&](ElementCount VF) -> bool { 8039 if (VF.isScalar()) 8040 return false; 8041 LoopVectorizationCostModel::InstWidening Decision = 8042 CM.getWideningDecision(I, VF); 8043 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8044 "CM decision should be taken at this point."); 8045 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8046 return true; 8047 if (CM.isScalarAfterVectorization(I, VF) || 8048 CM.isProfitableToScalarize(I, VF)) 8049 return false; 8050 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8051 }; 8052 8053 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8054 return nullptr; 8055 8056 VPValue *Mask = nullptr; 8057 if (Legal->isMaskRequired(I)) 8058 Mask = createBlockInMask(I->getParent(), Plan); 8059 8060 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8061 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8062 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8063 8064 StoreInst *Store = cast<StoreInst>(I); 8065 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8066 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8067 } 8068 8069 VPWidenIntOrFpInductionRecipe * 8070 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 8071 // Check if this is an integer or fp induction. If so, build the recipe that 8072 // produces its scalar and vector values. 8073 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8074 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8075 II.getKind() == InductionDescriptor::IK_FpInduction) 8076 return new VPWidenIntOrFpInductionRecipe(Phi); 8077 8078 return nullptr; 8079 } 8080 8081 VPWidenIntOrFpInductionRecipe * 8082 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 8083 VFRange &Range) const { 8084 // Optimize the special case where the source is a constant integer 8085 // induction variable. Notice that we can only optimize the 'trunc' case 8086 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8087 // (c) other casts depend on pointer size. 8088 8089 // Determine whether \p K is a truncation based on an induction variable that 8090 // can be optimized. 8091 auto isOptimizableIVTruncate = 8092 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8093 return [=](ElementCount VF) -> bool { 8094 return CM.isOptimizableIVTruncate(K, VF); 8095 }; 8096 }; 8097 8098 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8099 isOptimizableIVTruncate(I), Range)) 8100 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8101 I); 8102 return nullptr; 8103 } 8104 8105 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8106 // We know that all PHIs in non-header blocks are converted into selects, so 8107 // we don't have to worry about the insertion order and we can just use the 8108 // builder. At this point we generate the predication tree. There may be 8109 // duplications since this is a simple recursive scan, but future 8110 // optimizations will clean it up. 
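// Illustrative example: a phi such as
//   %p = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
// becomes a blend of %a and %b under the masks of the %bb1 and %bb2 edges,
// which is later lowered to select instructions; a phi with a single
// incoming value needs no mask at all.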
8111 8112 SmallVector<VPValue *, 2> Operands; 8113 unsigned NumIncoming = Phi->getNumIncomingValues(); 8114 for (unsigned In = 0; In < NumIncoming; In++) { 8115 VPValue *EdgeMask = 8116 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8117 assert((EdgeMask || NumIncoming == 1) && 8118 "Multiple predecessors with one having a full mask"); 8119 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8120 if (EdgeMask) 8121 Operands.push_back(EdgeMask); 8122 } 8123 return new VPBlendRecipe(Phi, Operands); 8124 } 8125 8126 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8127 VPlan &Plan) const { 8128 8129 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8130 [this, CI](ElementCount VF) { 8131 return CM.isScalarWithPredication(CI, VF); 8132 }, 8133 Range); 8134 8135 if (IsPredicated) 8136 return nullptr; 8137 8138 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8139 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8140 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8141 ID == Intrinsic::pseudoprobe)) 8142 return nullptr; 8143 8144 auto willWiden = [&](ElementCount VF) -> bool { 8145 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8146 // The following case may be scalarized depending on the VF. 8147 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8148 // version of the instruction. 8149 // Is it beneficial to perform intrinsic call compared to lib call? 8150 bool NeedToScalarize = false; 8151 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8152 bool UseVectorIntrinsic = 8153 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 8154 return UseVectorIntrinsic || !NeedToScalarize; 8155 }; 8156 8157 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8158 return nullptr; 8159 8160 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8161 } 8162 8163 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8164 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8165 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8166 // Instruction should be widened, unless it is scalar after vectorization, 8167 // scalarization is profitable or it is predicated. 
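  // getDecisionAndClampRange returns the predicate's answer at Range.Start and
  // clamps Range.End to the first power-of-two VF at which the answer flips,
  // so a single decision covers the remaining range. Illustrative numbers
  // (hypothetical, not taken from a real loop):
  //   Range = [4, 32), WillScalarize is false for VF=4 and VF=8, true for VF=16
  //   => Range is clamped to [4, 16) and shouldWiden returns true for it.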
8168 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8169 return CM.isScalarAfterVectorization(I, VF) || 8170 CM.isProfitableToScalarize(I, VF) || 8171 CM.isScalarWithPredication(I, VF); 8172 }; 8173 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8174 Range); 8175 } 8176 8177 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8178 auto IsVectorizableOpcode = [](unsigned Opcode) { 8179 switch (Opcode) { 8180 case Instruction::Add: 8181 case Instruction::And: 8182 case Instruction::AShr: 8183 case Instruction::BitCast: 8184 case Instruction::FAdd: 8185 case Instruction::FCmp: 8186 case Instruction::FDiv: 8187 case Instruction::FMul: 8188 case Instruction::FNeg: 8189 case Instruction::FPExt: 8190 case Instruction::FPToSI: 8191 case Instruction::FPToUI: 8192 case Instruction::FPTrunc: 8193 case Instruction::FRem: 8194 case Instruction::FSub: 8195 case Instruction::ICmp: 8196 case Instruction::IntToPtr: 8197 case Instruction::LShr: 8198 case Instruction::Mul: 8199 case Instruction::Or: 8200 case Instruction::PtrToInt: 8201 case Instruction::SDiv: 8202 case Instruction::Select: 8203 case Instruction::SExt: 8204 case Instruction::Shl: 8205 case Instruction::SIToFP: 8206 case Instruction::SRem: 8207 case Instruction::Sub: 8208 case Instruction::Trunc: 8209 case Instruction::UDiv: 8210 case Instruction::UIToFP: 8211 case Instruction::URem: 8212 case Instruction::Xor: 8213 case Instruction::ZExt: 8214 return true; 8215 } 8216 return false; 8217 }; 8218 8219 if (!IsVectorizableOpcode(I->getOpcode())) 8220 return nullptr; 8221 8222 // Success: widen this instruction. 8223 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8224 } 8225 8226 VPBasicBlock *VPRecipeBuilder::handleReplication( 8227 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8228 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 8229 VPlanPtr &Plan) { 8230 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8231 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8232 Range); 8233 8234 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8235 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8236 Range); 8237 8238 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8239 IsUniform, IsPredicated); 8240 setRecipe(I, Recipe); 8241 Plan->addVPValue(I, Recipe); 8242 8243 // Find if I uses a predicated instruction. If so, it will use its scalar 8244 // value. Avoid hoisting the insert-element which packs the scalar value into 8245 // a vector value, as that happens iff all users use the vector value. 8246 for (auto &Op : I->operands()) 8247 if (auto *PredInst = dyn_cast<Instruction>(Op)) 8248 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 8249 PredInst2Recipe[PredInst]->setAlsoPack(false); 8250 8251 // Finalize the recipe for Instr, first if it is not predicated. 8252 if (!IsPredicated) { 8253 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8254 VPBB->appendRecipe(Recipe); 8255 return VPBB; 8256 } 8257 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8258 assert(VPBB->getSuccessors().empty() && 8259 "VPBB has successors when handling predicated replication."); 8260 // Record predicated instructions for above packing optimizations. 
8261 PredInst2Recipe[I] = Recipe; 8262 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8263 VPBlockUtils::insertBlockAfter(Region, VPBB); 8264 auto *RegSucc = new VPBasicBlock(); 8265 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8266 return RegSucc; 8267 } 8268 8269 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8270 VPRecipeBase *PredRecipe, 8271 VPlanPtr &Plan) { 8272 // Instructions marked for predication are replicated and placed under an 8273 // if-then construct to prevent side-effects. 8274 8275 // Generate recipes to compute the block mask for this region. 8276 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8277 8278 // Build the triangular if-then region. 8279 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8280 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8281 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8282 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8283 auto *PHIRecipe = Instr->getType()->isVoidTy() 8284 ? nullptr 8285 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8286 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8287 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8288 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8289 8290 // Note: first set Entry as region entry and then connect successors starting 8291 // from it in order, to propagate the "parent" of each VPBasicBlock. 8292 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8293 VPBlockUtils::connectBlocks(Pred, Exit); 8294 8295 return Region; 8296 } 8297 8298 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8299 VFRange &Range, 8300 VPlanPtr &Plan) { 8301 // First, check for specific widening recipes that deal with calls, memory 8302 // operations, inductions and Phi nodes. 8303 if (auto *CI = dyn_cast<CallInst>(Instr)) 8304 return tryToWidenCall(CI, Range, *Plan); 8305 8306 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8307 return tryToWidenMemory(Instr, Range, Plan); 8308 8309 VPRecipeBase *Recipe; 8310 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8311 if (Phi->getParent() != OrigLoop->getHeader()) 8312 return tryToBlend(Phi, Plan); 8313 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 8314 return Recipe; 8315 return new VPWidenPHIRecipe(Phi); 8316 } 8317 8318 if (isa<TruncInst>(Instr) && 8319 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 8320 return Recipe; 8321 8322 if (!shouldWiden(Instr, Range)) 8323 return nullptr; 8324 8325 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8326 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 8327 OrigLoop); 8328 8329 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8330 bool InvariantCond = 8331 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8332 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 8333 InvariantCond); 8334 } 8335 8336 return tryToWiden(Instr, *Plan); 8337 } 8338 8339 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8340 ElementCount MaxVF) { 8341 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8342 8343 // Collect instructions from the original loop that will become trivially dead 8344 // in the vectorized loop. We don't need to vectorize these instructions. 
For
8345   // example, original induction update instructions can become dead because we
8346   // separately emit induction "steps" when generating code for the new loop.
8347   // Similarly, we create a new latch condition when setting up the structure
8348   // of the new loop, so the old one can become dead.
8349   SmallPtrSet<Instruction *, 4> DeadInstructions;
8350   collectTriviallyDeadInstructions(DeadInstructions);
8351
8352   // Add assume instructions we need to drop to DeadInstructions, to prevent
8353   // them from being added to the VPlan.
8354   // TODO: We only need to drop assumes in blocks that get flattened. If the
8355   // control flow is preserved, we should keep them.
8356   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8357   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8358
8359   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8360   // Dead instructions do not need sinking. Remove them from SinkAfter.
8361   for (Instruction *I : DeadInstructions)
8362     SinkAfter.erase(I);
8363
8364   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8365   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8366     VFRange SubRange = {VF, MaxVFPlusOne};
8367     VPlans.push_back(
8368         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8369     VF = SubRange.End;
8370   }
8371 }
8372
8373 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8374     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8375     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8376
8377   // Hold a mapping from predicated instructions to their recipes, in order to
8378   // fix their AlsoPack behavior if a user is determined to replicate and use a
8379   // scalar instead of vector value.
8380   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8381
8382   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8383
8384   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8385
8386   // ---------------------------------------------------------------------------
8387   // Pre-construction: record ingredients whose recipes we'll need to further
8388   // process after constructing the initial VPlan.
8389   // ---------------------------------------------------------------------------
8390
8391   // Mark instructions we'll need to sink later and their targets as
8392   // ingredients whose recipe we'll need to record.
8393   for (auto &Entry : SinkAfter) {
8394     RecipeBuilder.recordRecipeOf(Entry.first);
8395     RecipeBuilder.recordRecipeOf(Entry.second);
8396   }
8397   for (auto &Reduction : CM.getInLoopReductionChains()) {
8398     PHINode *Phi = Reduction.first;
8399     RecurrenceDescriptor::RecurrenceKind Kind =
8400         Legal->getReductionVars()[Phi].getRecurrenceKind();
8401     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8402
8403     RecipeBuilder.recordRecipeOf(Phi);
8404     for (auto &R : ReductionOperations) {
8405       RecipeBuilder.recordRecipeOf(R);
8406       // For min/max reductions, where we have a pair of icmp/select, we also
8407       // need to record the ICmp recipe, so it can be removed later.
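      // Illustrative example (hypothetical IR for an integer smax reduction):
      //   %cmp      = icmp sgt i32 %val, %acc
      //   %acc.next = select i1 %cmp, i32 %val, i32 %acc
      // Here %acc.next is part of ReductionOperations and %cmp is its operand
      // 0; recording %cmp lets adjustRecipesForInLoopReductions erase its
      // widened recipe once the icmp/select pair is replaced by a single
      // VPReductionRecipe.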
8408 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8409 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8410 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8411 } 8412 } 8413 } 8414 8415 // For each interleave group which is relevant for this (possibly trimmed) 8416 // Range, add it to the set of groups to be later applied to the VPlan and add 8417 // placeholders for its members' Recipes which we'll be replacing with a 8418 // single VPInterleaveRecipe. 8419 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8420 auto applyIG = [IG, this](ElementCount VF) -> bool { 8421 return (VF.isVector() && // Query is illegal for VF == 1 8422 CM.getWideningDecision(IG->getInsertPos(), VF) == 8423 LoopVectorizationCostModel::CM_Interleave); 8424 }; 8425 if (!getDecisionAndClampRange(applyIG, Range)) 8426 continue; 8427 InterleaveGroups.insert(IG); 8428 for (unsigned i = 0; i < IG->getFactor(); i++) 8429 if (Instruction *Member = IG->getMember(i)) 8430 RecipeBuilder.recordRecipeOf(Member); 8431 }; 8432 8433 // --------------------------------------------------------------------------- 8434 // Build initial VPlan: Scan the body of the loop in a topological order to 8435 // visit each basic block after having visited its predecessor basic blocks. 8436 // --------------------------------------------------------------------------- 8437 8438 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 8439 auto Plan = std::make_unique<VPlan>(); 8440 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 8441 Plan->setEntry(VPBB); 8442 8443 // Scan the body of the loop in a topological order to visit each basic block 8444 // after having visited its predecessor basic blocks. 8445 LoopBlocksDFS DFS(OrigLoop); 8446 DFS.perform(LI); 8447 8448 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8449 // Relevant instructions from basic block BB will be grouped into VPRecipe 8450 // ingredients and fill a new VPBasicBlock. 8451 unsigned VPBBsForBB = 0; 8452 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 8453 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 8454 VPBB = FirstVPBBForBB; 8455 Builder.setInsertPoint(VPBB); 8456 8457 // Introduce each ingredient into VPlan. 8458 // TODO: Model and preserve debug instrinsics in VPlan. 8459 for (Instruction &I : BB->instructionsWithoutDebug()) { 8460 Instruction *Instr = &I; 8461 8462 // First filter out irrelevant instructions, to ensure no recipes are 8463 // built for them. 8464 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8465 continue; 8466 8467 if (auto Recipe = 8468 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 8469 for (auto *Def : Recipe->definedValues()) { 8470 auto *UV = Def->getUnderlyingValue(); 8471 Plan->addVPValue(UV, Def); 8472 } 8473 8474 RecipeBuilder.setRecipe(Instr, Recipe); 8475 VPBB->appendRecipe(Recipe); 8476 continue; 8477 } 8478 8479 // Otherwise, if all widening options failed, Instruction is to be 8480 // replicated. This may create a successor for VPBB. 8481 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 8482 Instr, Range, VPBB, PredInst2Recipe, Plan); 8483 if (NextVPBB != VPBB) { 8484 VPBB = NextVPBB; 8485 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8486 : ""); 8487 } 8488 } 8489 } 8490 8491 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 8492 // may also be empty, such as the last one VPBB, reflecting original 8493 // basic-blocks with no recipes. 
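  // Illustrative shape at this point, for a loop whose body is a single basic
  // block named "for.body" (names as produced above):
  //   Pre-Entry -> for.body [-> for.body.0 -> ...]
  // The code below disconnects Pre-Entry and makes the VPBasicBlock created
  // for the loop header the entry of the VPlan.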
8494 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 8495 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 8496 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 8497 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 8498 delete PreEntry; 8499 8500 // --------------------------------------------------------------------------- 8501 // Transform initial VPlan: Apply previously taken decisions, in order, to 8502 // bring the VPlan to its final state. 8503 // --------------------------------------------------------------------------- 8504 8505 // Apply Sink-After legal constraints. 8506 for (auto &Entry : SinkAfter) { 8507 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8508 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8509 Sink->moveAfter(Target); 8510 } 8511 8512 // Interleave memory: for each Interleave Group we marked earlier as relevant 8513 // for this VPlan, replace the Recipes widening its memory instructions with a 8514 // single VPInterleaveRecipe at its insertion point. 8515 for (auto IG : InterleaveGroups) { 8516 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8517 RecipeBuilder.getRecipe(IG->getInsertPos())); 8518 SmallVector<VPValue *, 4> StoredValues; 8519 for (unsigned i = 0; i < IG->getFactor(); ++i) 8520 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8521 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8522 8523 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8524 Recipe->getMask()); 8525 VPIG->insertBefore(Recipe); 8526 unsigned J = 0; 8527 for (unsigned i = 0; i < IG->getFactor(); ++i) 8528 if (Instruction *Member = IG->getMember(i)) { 8529 if (!Member->getType()->isVoidTy()) { 8530 VPValue *OriginalV = Plan->getVPValue(Member); 8531 Plan->removeVPValueFor(Member); 8532 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8533 J++; 8534 } 8535 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8536 } 8537 } 8538 8539 // Adjust the recipes for any inloop reductions. 8540 if (Range.Start.isVector()) 8541 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8542 8543 // Finally, if tail is folded by masking, introduce selects between the phi 8544 // and the live-out instruction of each reduction, at the end of the latch. 8545 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8546 Builder.setInsertPoint(VPBB); 8547 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8548 for (auto &Reduction : Legal->getReductionVars()) { 8549 if (CM.isInLoopReduction(Reduction.first)) 8550 continue; 8551 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8552 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8553 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8554 } 8555 } 8556 8557 std::string PlanName; 8558 raw_string_ostream RSO(PlanName); 8559 ElementCount VF = Range.Start; 8560 Plan->addVF(VF); 8561 RSO << "Initial VPlan for VF={" << VF; 8562 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8563 Plan->addVF(VF); 8564 RSO << "," << VF; 8565 } 8566 RSO << "},UF>=1"; 8567 RSO.flush(); 8568 Plan->setName(PlanName); 8569 8570 return Plan; 8571 } 8572 8573 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8574 // Outer loop handling: They may require CFG and instruction level 8575 // transformations before even evaluating whether vectorization is profitable. 
8576 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8577 // the vectorization pipeline. 8578 assert(!OrigLoop->isInnermost()); 8579 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8580 8581 // Create new empty VPlan 8582 auto Plan = std::make_unique<VPlan>(); 8583 8584 // Build hierarchical CFG 8585 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 8586 HCFGBuilder.buildHierarchicalCFG(); 8587 8588 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 8589 VF *= 2) 8590 Plan->addVF(VF); 8591 8592 if (EnableVPlanPredication) { 8593 VPlanPredicator VPP(*Plan); 8594 VPP.predicate(); 8595 8596 // Avoid running transformation to recipes until masked code generation in 8597 // VPlan-native path is in place. 8598 return Plan; 8599 } 8600 8601 SmallPtrSet<Instruction *, 1> DeadInstructions; 8602 VPlanTransforms::VPInstructionsToVPRecipes( 8603 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 8604 return Plan; 8605 } 8606 8607 // Adjust the recipes for any inloop reductions. The chain of instructions 8608 // leading from the loop exit instr to the phi need to be converted to 8609 // reductions, with one operand being vector and the other being the scalar 8610 // reduction chain. 8611 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 8612 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { 8613 for (auto &Reduction : CM.getInLoopReductionChains()) { 8614 PHINode *Phi = Reduction.first; 8615 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8616 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8617 8618 // ReductionOperations are orders top-down from the phi's use to the 8619 // LoopExitValue. We keep a track of the previous item (the Chain) to tell 8620 // which of the two operands will remain scalar and which will be reduced. 8621 // For minmax the chain will be the select instructions. 8622 Instruction *Chain = Phi; 8623 for (Instruction *R : ReductionOperations) { 8624 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 8625 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind(); 8626 8627 VPValue *ChainOp = Plan->getVPValue(Chain); 8628 unsigned FirstOpId; 8629 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8630 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8631 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 8632 "Expected to replace a VPWidenSelectSC"); 8633 FirstOpId = 1; 8634 } else { 8635 assert(isa<VPWidenRecipe>(WidenRecipe) && 8636 "Expected to replace a VPWidenSC"); 8637 FirstOpId = 0; 8638 } 8639 unsigned VecOpId = 8640 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 8641 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 8642 8643 auto *CondOp = CM.foldTailByMasking() 8644 ? 
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 8645 : nullptr; 8646 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 8647 &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); 8648 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8649 Plan->removeVPValueFor(R); 8650 Plan->addVPValue(R, RedRecipe); 8651 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 8652 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8653 WidenRecipe->eraseFromParent(); 8654 8655 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8656 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8657 VPRecipeBase *CompareRecipe = 8658 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 8659 assert(isa<VPWidenRecipe>(CompareRecipe) && 8660 "Expected to replace a VPWidenSC"); 8661 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 8662 "Expected no remaining users"); 8663 CompareRecipe->eraseFromParent(); 8664 } 8665 Chain = R; 8666 } 8667 } 8668 } 8669 8670 Value* LoopVectorizationPlanner::VPCallbackILV:: 8671 getOrCreateVectorValues(Value *V, unsigned Part) { 8672 return ILV.getOrCreateVectorValue(V, Part); 8673 } 8674 8675 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 8676 Value *V, const VPIteration &Instance) { 8677 return ILV.getOrCreateScalarValue(V, Instance); 8678 } 8679 8680 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 8681 VPSlotTracker &SlotTracker) const { 8682 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 8683 IG->getInsertPos()->printAsOperand(O, false); 8684 O << ", "; 8685 getAddr()->printAsOperand(O, SlotTracker); 8686 VPValue *Mask = getMask(); 8687 if (Mask) { 8688 O << ", "; 8689 Mask->printAsOperand(O, SlotTracker); 8690 } 8691 for (unsigned i = 0; i < IG->getFactor(); ++i) 8692 if (Instruction *I = IG->getMember(i)) 8693 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 8694 } 8695 8696 void VPWidenCallRecipe::execute(VPTransformState &State) { 8697 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 8698 *this, State); 8699 } 8700 8701 void VPWidenSelectRecipe::execute(VPTransformState &State) { 8702 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 8703 this, *this, InvariantCond, State); 8704 } 8705 8706 void VPWidenRecipe::execute(VPTransformState &State) { 8707 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 8708 } 8709 8710 void VPWidenGEPRecipe::execute(VPTransformState &State) { 8711 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 8712 *this, State.UF, State.VF, IsPtrLoopInvariant, 8713 IsIndexLoopInvariant, State); 8714 } 8715 8716 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 8717 assert(!State.Instance && "Int or FP induction being replicated."); 8718 State.ILV->widenIntOrFpInduction(IV, Trunc); 8719 } 8720 8721 void VPWidenPHIRecipe::execute(VPTransformState &State) { 8722 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 8723 } 8724 8725 void VPBlendRecipe::execute(VPTransformState &State) { 8726 State.ILV->setDebugLocFromInst(State.Builder, Phi); 8727 // We know that all PHIs in non-header blocks are converted into 8728 // selects, so we don't have to worry about the insertion order and we 8729 // can just use the builder. 8730 // At this point we generate the predication tree. 
There may be 8731 // duplications since this is a simple recursive scan, but future 8732 // optimizations will clean it up. 8733 8734 unsigned NumIncoming = getNumIncomingValues(); 8735 8736 // Generate a sequence of selects of the form: 8737 // SELECT(Mask3, In3, 8738 // SELECT(Mask2, In2, 8739 // SELECT(Mask1, In1, 8740 // In0))) 8741 // Note that Mask0 is never used: lanes for which no path reaches this phi and 8742 // are essentially undef are taken from In0. 8743 InnerLoopVectorizer::VectorParts Entry(State.UF); 8744 for (unsigned In = 0; In < NumIncoming; ++In) { 8745 for (unsigned Part = 0; Part < State.UF; ++Part) { 8746 // We might have single edge PHIs (blocks) - use an identity 8747 // 'select' for the first PHI operand. 8748 Value *In0 = State.get(getIncomingValue(In), Part); 8749 if (In == 0) 8750 Entry[Part] = In0; // Initialize with the first incoming value. 8751 else { 8752 // Select between the current value and the previous incoming edge 8753 // based on the incoming mask. 8754 Value *Cond = State.get(getMask(In), Part); 8755 Entry[Part] = 8756 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 8757 } 8758 } 8759 } 8760 for (unsigned Part = 0; Part < State.UF; ++Part) 8761 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 8762 } 8763 8764 void VPInterleaveRecipe::execute(VPTransformState &State) { 8765 assert(!State.Instance && "Interleave group being replicated."); 8766 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 8767 getStoredValues(), getMask()); 8768 } 8769 8770 void VPReductionRecipe::execute(VPTransformState &State) { 8771 assert(!State.Instance && "Reduction being replicated."); 8772 for (unsigned Part = 0; Part < State.UF; ++Part) { 8773 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind(); 8774 Value *NewVecOp = State.get(getVecOp(), Part); 8775 if (VPValue *Cond = getCondOp()) { 8776 Value *NewCond = State.get(Cond, Part); 8777 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 8778 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 8779 Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType()); 8780 Constant *IdenVec = 8781 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 8782 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 8783 NewVecOp = Select; 8784 } 8785 Value *NewRed = 8786 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN); 8787 Value *PrevInChain = State.get(getChainOp(), Part); 8788 Value *NextInChain; 8789 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8790 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8791 NextInChain = 8792 createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(), 8793 NewRed, PrevInChain); 8794 } else { 8795 NextInChain = State.Builder.CreateBinOp( 8796 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 8797 PrevInChain); 8798 } 8799 State.set(this, getUnderlyingInstr(), NextInChain, Part); 8800 } 8801 } 8802 8803 void VPReplicateRecipe::execute(VPTransformState &State) { 8804 if (State.Instance) { // Generate a single instance. 8805 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 8806 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, 8807 *State.Instance, IsPredicated, State); 8808 // Insert scalar instance packing it into a vector. 8809 if (AlsoPack && State.VF.isVector()) { 8810 // If we're constructing lane 0, initialize to start from undef. 
8811       if (State.Instance->Lane == 0) {
8812         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8813         Value *Undef = UndefValue::get(
8814             VectorType::get(getUnderlyingValue()->getType(), State.VF));
8815         State.ValueMap.setVectorValue(getUnderlyingInstr(),
8816                                       State.Instance->Part, Undef);
8817       }
8818       State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
8819                                            *State.Instance);
8820     }
8821     return;
8822   }
8823
8824   // Generate scalar instances for all VF lanes of all UF parts, unless the
8825   // instruction is uniform in which case generate only the first lane for each
8826   // of the UF parts.
8827   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8828   assert((!State.VF.isScalable() || IsUniform) &&
8829          "Can't scalarize a scalable vector");
8830   for (unsigned Part = 0; Part < State.UF; ++Part)
8831     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8832       State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
8833                                       IsPredicated, State);
8834 }
8835
8836 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8837   assert(State.Instance && "Branch on Mask works only on single instance.");
8838
8839   unsigned Part = State.Instance->Part;
8840   unsigned Lane = State.Instance->Lane;
8841
8842   Value *ConditionBit = nullptr;
8843   VPValue *BlockInMask = getMask();
8844   if (BlockInMask) {
8845     ConditionBit = State.get(BlockInMask, Part);
8846     if (ConditionBit->getType()->isVectorTy())
8847       ConditionBit = State.Builder.CreateExtractElement(
8848           ConditionBit, State.Builder.getInt32(Lane));
8849   } else // Block in mask is all-one.
8850     ConditionBit = State.Builder.getTrue();
8851
8852   // Replace the temporary unreachable terminator with a new conditional branch,
8853   // whose two destinations will be set later when they are created.
8854   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8855   assert(isa<UnreachableInst>(CurrentTerminator) &&
8856          "Expected to replace unreachable terminator with conditional branch.");
8857   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8858   CondBr->setSuccessor(0, nullptr);
8859   ReplaceInstWithInst(CurrentTerminator, CondBr);
8860 }
8861
8862 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8863   assert(State.Instance && "Predicated instruction PHI works per instance.");
8864   Instruction *ScalarPredInst =
8865       cast<Instruction>(State.get(getOperand(0), *State.Instance));
8866   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8867   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8868   assert(PredicatingBB && "Predicated block has no single predecessor.");
8869
8870   // By current pack/unpack logic we need to generate only a single phi node: if
8871   // a vector value for the predicated instruction exists at this point it means
8872   // the instruction has vector users only, and a phi for the vector value is
8873   // needed. In this case the recipe of the predicated instruction is marked to
8874   // also do that packing, thereby "hoisting" the insert-element sequence.
8875   // Otherwise, a phi node for the scalar value is needed.
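  // Illustrative shape of the generated IR (hypothetical names and values) for
  // a predicated instruction whose users need the vector value:
  //   pred.sdiv.if:                     ; PredicatedBB
  //     %d   = sdiv i32 %a, %b
  //     %vec = insertelement <4 x i32> %partial, i32 %d, i32 %lane
  //   pred.sdiv.continue:               ; contains the phi created below
  //     %phi = phi <4 x i32> [ %partial, %predicating.bb ], [ %vec, %pred.sdiv.if ]
  // where %predicating.bb stands for whatever block held the mask branch. In
  // the scalar case the phi is of the scalar type instead, with an undef
  // incoming value on the edge that bypassed the predicated block.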
8876 unsigned Part = State.Instance->Part; 8877 Instruction *PredInst = 8878 cast<Instruction>(getOperand(0)->getUnderlyingValue()); 8879 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 8880 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 8881 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 8882 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 8883 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 8884 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 8885 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 8886 } else { 8887 Type *PredInstType = PredInst->getType(); 8888 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 8889 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 8890 Phi->addIncoming(ScalarPredInst, PredicatedBB); 8891 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 8892 } 8893 } 8894 8895 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 8896 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 8897 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 8898 StoredValue ? nullptr : getVPValue(), 8899 getAddr(), StoredValue, getMask()); 8900 } 8901 8902 // Determine how to lower the scalar epilogue, which depends on 1) optimising 8903 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 8904 // predication, and 4) a TTI hook that analyses whether the loop is suitable 8905 // for predication. 8906 static ScalarEpilogueLowering getScalarEpilogueLowering( 8907 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 8908 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 8909 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 8910 LoopVectorizationLegality &LVL) { 8911 // 1) OptSize takes precedence over all other options, i.e. if this is set, 8912 // don't look at hints or options, and don't request a scalar epilogue. 8913 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 8914 // LoopAccessInfo (due to code dependency and not being able to reliably get 8915 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 8916 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 8917 // versioning when the vectorization is forced, unlike hasOptSize. So revert 8918 // back to the old way and vectorize with versioning when forced. See D81345.) 
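  // A condensed view of the precedence implemented below (illustrative summary;
  // the enumerators are the ones returned in this function):
  //   optsize / profile-guided size opt (unless vectorization is forced)
  //       -> CM_ScalarEpilogueNotAllowedOptSize
  //   PreferPredicateOverEpilogue command-line option, if given
  //       -> whichever of the three lowerings it requests
  //   loop hint Hints.getPredicate()
  //       -> enabled:  CM_ScalarEpilogueNotNeededUsePredicate
  //          disabled: CM_ScalarEpilogueAllowed
  //   TTI->preferPredicateOverEpilogue(...)
  //       -> CM_ScalarEpilogueNotNeededUsePredicate
  //   otherwise
  //       -> CM_ScalarEpilogueAllowed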
8919 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 8920 PGSOQueryType::IRPass) && 8921 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 8922 return CM_ScalarEpilogueNotAllowedOptSize; 8923 8924 // 2) If set, obey the directives 8925 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 8926 switch (PreferPredicateOverEpilogue) { 8927 case PreferPredicateTy::ScalarEpilogue: 8928 return CM_ScalarEpilogueAllowed; 8929 case PreferPredicateTy::PredicateElseScalarEpilogue: 8930 return CM_ScalarEpilogueNotNeededUsePredicate; 8931 case PreferPredicateTy::PredicateOrDontVectorize: 8932 return CM_ScalarEpilogueNotAllowedUsePredicate; 8933 }; 8934 } 8935 8936 // 3) If set, obey the hints 8937 switch (Hints.getPredicate()) { 8938 case LoopVectorizeHints::FK_Enabled: 8939 return CM_ScalarEpilogueNotNeededUsePredicate; 8940 case LoopVectorizeHints::FK_Disabled: 8941 return CM_ScalarEpilogueAllowed; 8942 }; 8943 8944 // 4) if the TTI hook indicates this is profitable, request predication. 8945 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 8946 LVL.getLAI())) 8947 return CM_ScalarEpilogueNotNeededUsePredicate; 8948 8949 return CM_ScalarEpilogueAllowed; 8950 } 8951 8952 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, 8953 unsigned Part) { 8954 set(Def, V, Part); 8955 ILV->setVectorValue(IRDef, Part, V); 8956 } 8957 8958 // Process the loop in the VPlan-native vectorization path. This path builds 8959 // VPlan upfront in the vectorization pipeline, which allows to apply 8960 // VPlan-to-VPlan transformations from the very beginning without modifying the 8961 // input LLVM IR. 8962 static bool processLoopInVPlanNativePath( 8963 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 8964 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 8965 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 8966 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 8967 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 8968 8969 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 8970 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 8971 return false; 8972 } 8973 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 8974 Function *F = L->getHeader()->getParent(); 8975 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 8976 8977 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8978 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 8979 8980 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 8981 &Hints, IAI); 8982 // Use the planner for outer loop vectorization. 8983 // TODO: CM is not used at this point inside the planner. Turn CM into an 8984 // optional argument if we don't need it in the future. 8985 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 8986 8987 // Get user vectorization factor. 8988 ElementCount UserVF = Hints.getWidth(); 8989 8990 // Plan how to best vectorize, return the best VF and its cost. 8991 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 8992 8993 // If we are stress testing VPlan builds, do not attempt to generate vector 8994 // code. Masked vector code generation support will follow soon. 8995 // Also, do not attempt to vectorize if no vector code will be produced. 
8996 if (VPlanBuildStressTest || EnableVPlanPredication || 8997 VectorizationFactor::Disabled() == VF) 8998 return false; 8999 9000 LVP.setBestPlan(VF.Width, 1); 9001 9002 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9003 &CM, BFI, PSI); 9004 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9005 << L->getHeader()->getParent()->getName() << "\"\n"); 9006 LVP.executePlan(LB, DT); 9007 9008 // Mark the loop as already vectorized to avoid vectorizing again. 9009 Hints.setAlreadyVectorized(); 9010 9011 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9012 return true; 9013 } 9014 9015 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9016 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9017 !EnableLoopInterleaving), 9018 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9019 !EnableLoopVectorization) {} 9020 9021 bool LoopVectorizePass::processLoop(Loop *L) { 9022 assert((EnableVPlanNativePath || L->isInnermost()) && 9023 "VPlan-native path is not enabled. Only process inner loops."); 9024 9025 #ifndef NDEBUG 9026 const std::string DebugLocStr = getDebugLocString(L); 9027 #endif /* NDEBUG */ 9028 9029 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9030 << L->getHeader()->getParent()->getName() << "\" from " 9031 << DebugLocStr << "\n"); 9032 9033 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9034 9035 LLVM_DEBUG( 9036 dbgs() << "LV: Loop hints:" 9037 << " force=" 9038 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9039 ? "disabled" 9040 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9041 ? "enabled" 9042 : "?")) 9043 << " width=" << Hints.getWidth() 9044 << " unroll=" << Hints.getInterleave() << "\n"); 9045 9046 // Function containing loop 9047 Function *F = L->getHeader()->getParent(); 9048 9049 // Looking at the diagnostic output is the only way to determine if a loop 9050 // was vectorized (other than looking at the IR or machine code), so it 9051 // is important to generate an optimization remark for each loop. Most of 9052 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9053 // generated as OptimizationRemark and OptimizationRemarkMissed are 9054 // less verbose reporting vectorized loops and unvectorized loops that may 9055 // benefit from vectorization, respectively. 9056 9057 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9058 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9059 return false; 9060 } 9061 9062 PredicatedScalarEvolution PSE(*SE, *L); 9063 9064 // Check if it is legal to vectorize the loop. 9065 LoopVectorizationRequirements Requirements(*ORE); 9066 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9067 &Requirements, &Hints, DB, AC, BFI, PSI); 9068 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9069 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9070 Hints.emitRemarkWithHints(); 9071 return false; 9072 } 9073 9074 // Check the function attributes and profiles to find out if this function 9075 // should be optimized for size. 9076 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9077 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9078 9079 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9080 // here. They may require CFG and instruction level transformations before 9081 // even evaluating whether vectorization is profitable. 
Since we cannot modify 9082 // the incoming IR, we need to build VPlan upfront in the vectorization 9083 // pipeline. 9084 if (!L->isInnermost()) 9085 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9086 ORE, BFI, PSI, Hints); 9087 9088 assert(L->isInnermost() && "Inner loop expected."); 9089 9090 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9091 // count by optimizing for size, to minimize overheads. 9092 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9093 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9094 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9095 << "This loop is worth vectorizing only if no scalar " 9096 << "iteration overheads are incurred."); 9097 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9098 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9099 else { 9100 LLVM_DEBUG(dbgs() << "\n"); 9101 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9102 } 9103 } 9104 9105 // Check the function attributes to see if implicit floats are allowed. 9106 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9107 // an integer loop and the vector instructions selected are purely integer 9108 // vector instructions? 9109 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9110 reportVectorizationFailure( 9111 "Can't vectorize when the NoImplicitFloat attribute is used", 9112 "loop not vectorized due to NoImplicitFloat attribute", 9113 "NoImplicitFloat", ORE, L); 9114 Hints.emitRemarkWithHints(); 9115 return false; 9116 } 9117 9118 // Check if the target supports potentially unsafe FP vectorization. 9119 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9120 // for the target we're vectorizing for, to make sure none of the 9121 // additional fp-math flags can help. 9122 if (Hints.isPotentiallyUnsafe() && 9123 TTI->isFPVectorizationPotentiallyUnsafe()) { 9124 reportVectorizationFailure( 9125 "Potentially unsafe FP op prevents vectorization", 9126 "loop not vectorized due to unsafe FP support.", 9127 "UnsafeFP", ORE, L); 9128 Hints.emitRemarkWithHints(); 9129 return false; 9130 } 9131 9132 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9133 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9134 9135 // If an override option has been passed in for interleaved accesses, use it. 9136 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9137 UseInterleaved = EnableInterleavedMemAccesses; 9138 9139 // Analyze interleaved memory accesses. 9140 if (UseInterleaved) { 9141 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 9142 } 9143 9144 // Use the cost model. 9145 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 9146 F, &Hints, IAI); 9147 CM.collectValuesToIgnore(); 9148 9149 // Use the planner for vectorization. 9150 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 9151 9152 // Get user vectorization factor and interleave count. 9153 ElementCount UserVF = Hints.getWidth(); 9154 unsigned UserIC = Hints.getInterleave(); 9155 9156 // Plan how to best vectorize, return the best VF and its cost. 9157 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 9158 9159 VectorizationFactor VF = VectorizationFactor::Disabled(); 9160 unsigned IC = 1; 9161 9162 if (MaybeVF) { 9163 VF = *MaybeVF; 9164 // Select the interleave count. 
9165     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9166   }
9167
9168   // Identify the diagnostic messages that should be produced.
9169   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9170   bool VectorizeLoop = true, InterleaveLoop = true;
9171   if (Requirements.doesNotMeet(F, L, Hints)) {
9172     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9173                          "requirements.\n");
9174     Hints.emitRemarkWithHints();
9175     return false;
9176   }
9177
9178   if (VF.Width.isScalar()) {
9179     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9180     VecDiagMsg = std::make_pair(
9181         "VectorizationNotBeneficial",
9182         "the cost-model indicates that vectorization is not beneficial");
9183     VectorizeLoop = false;
9184   }
9185
9186   if (!MaybeVF && UserIC > 1) {
9187     // Tell the user interleaving was avoided up-front, despite being explicitly
9188     // requested.
9189     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9190                          "interleaving should be avoided up front\n");
9191     IntDiagMsg = std::make_pair(
9192         "InterleavingAvoided",
9193         "Ignoring UserIC, because interleaving was avoided up front");
9194     InterleaveLoop = false;
9195   } else if (IC == 1 && UserIC <= 1) {
9196     // Tell the user interleaving is not beneficial.
9197     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9198     IntDiagMsg = std::make_pair(
9199         "InterleavingNotBeneficial",
9200         "the cost-model indicates that interleaving is not beneficial");
9201     InterleaveLoop = false;
9202     if (UserIC == 1) {
9203       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9204       IntDiagMsg.second +=
9205           " and is explicitly disabled or interleave count is set to 1";
9206     }
9207   } else if (IC > 1 && UserIC == 1) {
9208     // Tell the user interleaving is beneficial, but it is explicitly disabled.
9209     LLVM_DEBUG(
9210         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9211     IntDiagMsg = std::make_pair(
9212         "InterleavingBeneficialButDisabled",
9213         "the cost-model indicates that interleaving is beneficial "
9214         "but is explicitly disabled or interleave count is set to 1");
9215     InterleaveLoop = false;
9216   }
9217
9218   // Override IC if user provided an interleave count.
9219   IC = UserIC > 0 ? UserIC : IC;
9220
9221   // Emit diagnostic messages, if any.
9222   const char *VAPassName = Hints.vectorizeAnalysisPassName();
9223   if (!VectorizeLoop && !InterleaveLoop) {
9224     // Do not vectorize or interleave the loop.
9225 ORE->emit([&]() { 9226 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 9227 L->getStartLoc(), L->getHeader()) 9228 << VecDiagMsg.second; 9229 }); 9230 ORE->emit([&]() { 9231 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 9232 L->getStartLoc(), L->getHeader()) 9233 << IntDiagMsg.second; 9234 }); 9235 return false; 9236 } else if (!VectorizeLoop && InterleaveLoop) { 9237 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9238 ORE->emit([&]() { 9239 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 9240 L->getStartLoc(), L->getHeader()) 9241 << VecDiagMsg.second; 9242 }); 9243 } else if (VectorizeLoop && !InterleaveLoop) { 9244 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9245 << ") in " << DebugLocStr << '\n'); 9246 ORE->emit([&]() { 9247 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 9248 L->getStartLoc(), L->getHeader()) 9249 << IntDiagMsg.second; 9250 }); 9251 } else if (VectorizeLoop && InterleaveLoop) { 9252 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9253 << ") in " << DebugLocStr << '\n'); 9254 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9255 } 9256 9257 LVP.setBestPlan(VF.Width, IC); 9258 9259 using namespace ore; 9260 bool DisableRuntimeUnroll = false; 9261 MDNode *OrigLoopID = L->getLoopID(); 9262 9263 if (!VectorizeLoop) { 9264 assert(IC > 1 && "interleave count should not be 1 or 0"); 9265 // If we decided that it is not legal to vectorize the loop, then 9266 // interleave it. 9267 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, 9268 BFI, PSI); 9269 LVP.executePlan(Unroller, DT); 9270 9271 ORE->emit([&]() { 9272 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 9273 L->getHeader()) 9274 << "interleaved loop (interleaved count: " 9275 << NV("InterleaveCount", IC) << ")"; 9276 }); 9277 } else { 9278 // If we decided that it is *legal* to vectorize the loop, then do it. 9279 9280 // Consider vectorizing the epilogue too if it's profitable. 9281 VectorizationFactor EpilogueVF = 9282 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 9283 if (EpilogueVF.Width.isVector()) { 9284 9285 // The first pass vectorizes the main loop and creates a scalar epilogue 9286 // to be vectorized by executing the plan (potentially with a different 9287 // factor) again shortly afterwards. 9288 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 9289 EpilogueVF.Width.getKnownMinValue(), 1); 9290 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, 9291 &LVL, &CM, BFI, PSI); 9292 9293 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 9294 LVP.executePlan(MainILV, DT); 9295 ++LoopsVectorized; 9296 9297 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9298 formLCSSARecursively(*L, *DT, LI, SE); 9299 9300 // Second pass vectorizes the epilogue and adjusts the control flow 9301 // edges from the first pass. 
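      // Illustrative numbers (hypothetical): with VF.Width = 8, IC = 2 and
      // EpilogueVF.Width = 4 the transformed function contains, in order,
      //   a main vector loop      (VF = 8, UF = 2),
      //   an epilogue vector loop (VF = 4, UF = 1), and
      //   a scalar remainder loop,
      // with the trip-count checks created by the first pass rewired so the
      // epilogue loop picks up the iterations the main loop leaves behind.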
9302 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); 9303 EPI.MainLoopVF = EPI.EpilogueVF; 9304 EPI.MainLoopUF = EPI.EpilogueUF; 9305 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 9306 ORE, EPI, &LVL, &CM, BFI, PSI); 9307 LVP.executePlan(EpilogILV, DT); 9308 ++LoopsEpilogueVectorized; 9309 9310 if (!MainILV.areSafetyChecksAdded()) 9311 DisableRuntimeUnroll = true; 9312 } else { 9313 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 9314 &LVL, &CM, BFI, PSI); 9315 LVP.executePlan(LB, DT); 9316 ++LoopsVectorized; 9317 9318 // Add metadata to disable runtime unrolling a scalar loop when there are 9319 // no runtime checks about strides and memory. A scalar loop that is 9320 // rarely used is not worth unrolling. 9321 if (!LB.areSafetyChecksAdded()) 9322 DisableRuntimeUnroll = true; 9323 } 9324 9325 // Report the vectorization decision. 9326 ORE->emit([&]() { 9327 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 9328 L->getHeader()) 9329 << "vectorized loop (vectorization width: " 9330 << NV("VectorizationFactor", VF.Width) 9331 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 9332 }); 9333 } 9334 9335 Optional<MDNode *> RemainderLoopID = 9336 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 9337 LLVMLoopVectorizeFollowupEpilogue}); 9338 if (RemainderLoopID.hasValue()) { 9339 L->setLoopID(RemainderLoopID.getValue()); 9340 } else { 9341 if (DisableRuntimeUnroll) 9342 AddRuntimeUnrollDisableMetaData(L); 9343 9344 // Mark the loop as already vectorized to avoid vectorizing again. 9345 Hints.setAlreadyVectorized(); 9346 } 9347 9348 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9349 return true; 9350 } 9351 9352 LoopVectorizeResult LoopVectorizePass::runImpl( 9353 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 9354 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 9355 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 9356 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 9357 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 9358 SE = &SE_; 9359 LI = &LI_; 9360 TTI = &TTI_; 9361 DT = &DT_; 9362 BFI = &BFI_; 9363 TLI = TLI_; 9364 AA = &AA_; 9365 AC = &AC_; 9366 GetLAA = &GetLAA_; 9367 DB = &DB_; 9368 ORE = &ORE_; 9369 PSI = PSI_; 9370 9371 // Don't attempt if 9372 // 1. the target claims to have no vector registers, and 9373 // 2. interleaving won't help ILP. 9374 // 9375 // The second condition is necessary because, even if the target has no 9376 // vector registers, loop vectorization may still enable scalar 9377 // interleaving. 9378 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 9379 TTI->getMaxInterleaveFactor(1) < 2) 9380 return LoopVectorizeResult(false, false); 9381 9382 bool Changed = false, CFGChanged = false; 9383 9384 // The vectorizer requires loops to be in simplified form. 9385 // Since simplification may add new inner loops, it has to run before the 9386 // legality and profitability checks. This means running the loop vectorizer 9387 // will simplify all loops, regardless of whether anything end up being 9388 // vectorized. 9389 for (auto &L : *LI) 9390 Changed |= CFGChanged |= 9391 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9392 9393 // Build up a worklist of inner-loops to vectorize. This is necessary as 9394 // the act of vectorizing or partially unrolling a loop creates new loops 9395 // and can invalidate iterators across the loops. 
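  // In miniature (an illustrative sketch only, with a hypothetical transform()
  // standing in for processLoop): snapshot the candidates first, then mutate:
  //   SmallVector<Loop *, 8> Snapshot(LI->begin(), LI->end());
  //   for (Loop *L : Snapshot)
  //     transform(L); // may add or remove loops, but Snapshot stays stable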
9396 SmallVector<Loop *, 8> Worklist; 9397 9398 for (Loop *L : *LI) 9399 collectSupportedLoops(*L, LI, ORE, Worklist); 9400 9401 LoopsAnalyzed += Worklist.size(); 9402 9403 // Now walk the identified inner loops. 9404 while (!Worklist.empty()) { 9405 Loop *L = Worklist.pop_back_val(); 9406 9407 // For the inner loops we actually process, form LCSSA to simplify the 9408 // transform. 9409 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 9410 9411 Changed |= CFGChanged |= processLoop(L); 9412 } 9413 9414 // Process each loop nest in the function. 9415 return LoopVectorizeResult(Changed, CFGChanged); 9416 } 9417 9418 PreservedAnalyses LoopVectorizePass::run(Function &F, 9419 FunctionAnalysisManager &AM) { 9420 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 9421 auto &LI = AM.getResult<LoopAnalysis>(F); 9422 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 9423 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 9424 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 9425 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 9426 auto &AA = AM.getResult<AAManager>(F); 9427 auto &AC = AM.getResult<AssumptionAnalysis>(F); 9428 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 9429 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 9430 MemorySSA *MSSA = EnableMSSALoopDependency 9431 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 9432 : nullptr; 9433 9434 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 9435 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 9436 [&](Loop &L) -> const LoopAccessInfo & { 9437 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 9438 TLI, TTI, nullptr, MSSA}; 9439 return LAM.getResult<LoopAccessAnalysis>(L, AR); 9440 }; 9441 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 9442 ProfileSummaryInfo *PSI = 9443 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 9444 LoopVectorizeResult Result = 9445 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 9446 if (!Result.MadeAnyChange) 9447 return PreservedAnalyses::all(); 9448 PreservedAnalyses PA; 9449 9450 // We currently do not preserve loopinfo/dominator analyses with outer loop 9451 // vectorization. Until this is addressed, mark these analyses as preserved 9452 // only for non-VPlan-native path. 9453 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 9454 if (!EnableVPlanNativePath) { 9455 PA.preserve<LoopAnalysis>(); 9456 PA.preserve<DominatorTreeAnalysis>(); 9457 } 9458 PA.preserve<BasicAA>(); 9459 PA.preserve<GlobalsAA>(); 9460 if (!Result.MadeCFGChange) 9461 PA.preserveSet<CFGAnalyses>(); 9462 return PA; 9463 } 9464
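//===----------------------------------------------------------------------===//
// Usage note (illustrative sketch only, not part of the pass): under the new
// pass manager the vectorizer normally runs as part of the default pipelines,
// e.g.
//   opt -passes='default<O2>' input.ll -S
// and can also be requested on its own:
//   opt -passes=loop-vectorize input.ll -S
// A minimal programmatic setup, mirroring LoopVectorizePass::run above
// (assumes F is an llvm::Function inside an already-parsed module and that
// llvm/Passes/PassBuilder.h is included):
//   PassBuilder PB;
//   LoopAnalysisManager LAM;
//   FunctionAnalysisManager FAM;
//   CGSCCAnalysisManager CGAM;
//   ModuleAnalysisManager MAM;
//   PB.registerModuleAnalyses(MAM);
//   PB.registerCGSCCAnalyses(CGAM);
//   PB.registerFunctionAnalyses(FAM);
//   PB.registerLoopAnalyses(LAM);
//   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
//   FunctionPassManager FPM;
//   FPM.addPass(LoopVectorizePass(LoopVectorizeOptions()));
//   FPM.run(F, FAM);
//===----------------------------------------------------------------------===//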