//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; it lists all options. I.e.,
// the vectorizer will try to fold the tail loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
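/// (As an illustrative example, a group that loads only A[3*i] and A[3*i+2]
/// from a three-element record leaves a gap at A[3*i+1]; a masked wide load
/// can disable the lanes corresponding to the gap.)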
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
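/// (A store needs predication when it executes only under a condition inside
/// the loop body, e.g. "if (c[i]) a[i] = x;" in the source loop.)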
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over a select after the loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of a loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
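/// For example, assuming a typical DataLayout, i1 is irregular at VF = 4: an
/// array of four i1 elements occupies 4 bytes, while the store size of
/// <4 x i1> is a single byte, so the two layouts are not bitcast compatible.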
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
/// 1) Returns exact trip count if it is known.
/// 2) Returns expected trip count according to profile data if any.
/// 3) Returns upper bound estimate if it is known.
/// 4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects.
/// The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
                ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between
  /// \p MinLane and \p MaxLane, times each part between \p MinPart and
  /// \p MaxPart, inclusive.
  /// Uses the VPValue operands from \p Operands instead of \p Instr's
  /// operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate
  /// a vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one.
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
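  /// (As an illustrative example, with StartValue 16, StepValue 4 and Index 3,
  /// the integer case yields 16 + 3 * 4 = 28.)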
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  void createInductionResumeValues(Loop *L, Value *VectorTripCount);

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar loops.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
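  /// A typical example, under the usual cost-model assumptions, is the address
  /// computation feeding a memory access that will be widened into a single
  /// wide load or store: only the lane-zero address is needed, so the
  /// computation is treated as uniform after vectorization.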
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W, unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(!VF.isScalable() && "scalable vectors not yet supported.");
    assert(VF.isVector() && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. not disabled due to
  /// optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail loop.
1391 bool foldTailByMasking() const { return FoldTailByMasking; } 1392 1393 bool blockNeedsPredication(BasicBlock *BB) { 1394 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1395 } 1396 1397 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1398 /// nodes to the chain of instructions representing the reductions. Uses a 1399 /// MapVector to ensure deterministic iteration order. 1400 using ReductionChainMap = 1401 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1402 1403 /// Return the chain of instructions representing an inloop reduction. 1404 const ReductionChainMap &getInLoopReductionChains() const { 1405 return InLoopReductionChains; 1406 } 1407 1408 /// Returns true if the Phi is part of an inloop reduction. 1409 bool isInLoopReduction(PHINode *Phi) const { 1410 return InLoopReductionChains.count(Phi); 1411 } 1412 1413 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1414 /// with factor VF. Return the cost of the instruction, including 1415 /// scalarization overhead if it's needed. 1416 unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1417 1418 /// Estimate cost of a call instruction CI if it were vectorized with factor 1419 /// VF. Return the cost of the instruction, including scalarization overhead 1420 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1421 /// scalarized - 1422 /// i.e. either vector version isn't available, or is too expensive. 1423 unsigned getVectorCallCost(CallInst *CI, ElementCount VF, 1424 bool &NeedToScalarize); 1425 1426 /// Invalidates decisions already taken by the cost model. 1427 void invalidateCostModelingDecisions() { 1428 WideningDecisions.clear(); 1429 Uniforms.clear(); 1430 Scalars.clear(); 1431 } 1432 1433 private: 1434 unsigned NumPredStores = 0; 1435 1436 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1437 /// than zero. One is returned if vectorization should best be avoided due 1438 /// to cost. 1439 unsigned computeFeasibleMaxVF(unsigned ConstTripCount); 1440 1441 /// The vectorization cost is a combination of the cost itself and a boolean 1442 /// indicating whether any of the contributing operations will actually 1443 /// operate on 1444 /// vector values after type legalization in the backend. If this latter value 1445 /// is 1446 /// false, then all operations will be scalarized (i.e. no vectorization has 1447 /// actually taken place). 1448 using VectorizationCostTy = std::pair<unsigned, bool>; 1449 1450 /// Returns the expected execution cost. The unit of the cost does 1451 /// not matter because we use the 'cost' units to compare different 1452 /// vector widths. The cost that is returned is *not* normalized by 1453 /// the factor width. 1454 VectorizationCostTy expectedCost(ElementCount VF); 1455 1456 /// Returns the execution time cost of an instruction for a given vector 1457 /// width. Vector width of one means scalar. 1458 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1459 1460 /// The cost-computation logic from getInstructionCost which provides 1461 /// the vector type as an output parameter. 1462 unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy); 1463 1464 /// Calculate vectorization cost of memory instruction \p I. 1465 unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF); 1466 1467 /// The cost computation for scalarized memory instruction. 
  unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for an interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for a Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
1536 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1537 1538 /// Holds the instructions (address computations) that are forced to be 1539 /// scalarized. 1540 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1541 1542 /// PHINodes of the reductions that should be expanded in-loop along with 1543 /// their associated chains of reduction operations, in program order from top 1544 /// (PHI) to bottom 1545 ReductionChainMap InLoopReductionChains; 1546 1547 /// Returns the expected difference in cost from scalarizing the expression 1548 /// feeding a predicated instruction \p PredInst. The instructions to 1549 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1550 /// non-negative return value implies the expression will be scalarized. 1551 /// Currently, only single-use chains are considered for scalarization. 1552 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1553 ElementCount VF); 1554 1555 /// Collect the instructions that are uniform after vectorization. An 1556 /// instruction is uniform if we represent it with a single scalar value in 1557 /// the vectorized loop corresponding to each vector iteration. Examples of 1558 /// uniform instructions include pointer operands of consecutive or 1559 /// interleaved memory accesses. Note that although uniformity implies an 1560 /// instruction will be scalar, the reverse is not true. In general, a 1561 /// scalarized instruction will be represented by VF scalar values in the 1562 /// vectorized loop, each corresponding to an iteration of the original 1563 /// scalar loop. 1564 void collectLoopUniforms(ElementCount VF); 1565 1566 /// Collect the instructions that are scalar after vectorization. An 1567 /// instruction is scalar if it is known to be uniform or will be scalarized 1568 /// during vectorization. Non-uniform scalarized instructions will be 1569 /// represented by VF values in the vectorized loop, each corresponding to an 1570 /// iteration of the original scalar loop. 1571 void collectLoopScalars(ElementCount VF); 1572 1573 /// Keeps cost model vectorization decision and cost for instructions. 1574 /// Right now it is used for memory instructions only. 1575 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1576 std::pair<InstWidening, unsigned>>; 1577 1578 DecisionList WideningDecisions; 1579 1580 /// Returns true if \p V is expected to be vectorized and it needs to be 1581 /// extracted. 1582 bool needsExtract(Value *V, ElementCount VF) const { 1583 Instruction *I = dyn_cast<Instruction>(V); 1584 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1585 TheLoop->isLoopInvariant(I)) 1586 return false; 1587 1588 // Assume we can vectorize V (and hence we need extraction) if the 1589 // scalars are not computed yet. This can happen, because it is called 1590 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1591 // the scalars are collected. That should be a safe assumption in most 1592 // cases, because we check if the operands have vectorizable types 1593 // beforehand in LoopVectorizationLegality. 1594 return Scalars.find(VF) == Scalars.end() || 1595 !isScalarAfterVectorization(I, VF); 1596 }; 1597 1598 /// Returns a range containing only operands needing to be extracted. 
1599 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1600 ElementCount VF) { 1601 return SmallVector<Value *, 4>(make_filter_range( 1602 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1603 } 1604 1605 public: 1606 /// The loop that we evaluate. 1607 Loop *TheLoop; 1608 1609 /// Predicated scalar evolution analysis. 1610 PredicatedScalarEvolution &PSE; 1611 1612 /// Loop Info analysis. 1613 LoopInfo *LI; 1614 1615 /// Vectorization legality. 1616 LoopVectorizationLegality *Legal; 1617 1618 /// Vector target information. 1619 const TargetTransformInfo &TTI; 1620 1621 /// Target Library Info. 1622 const TargetLibraryInfo *TLI; 1623 1624 /// Demanded bits analysis. 1625 DemandedBits *DB; 1626 1627 /// Assumption cache. 1628 AssumptionCache *AC; 1629 1630 /// Interface to emit optimization remarks. 1631 OptimizationRemarkEmitter *ORE; 1632 1633 const Function *TheFunction; 1634 1635 /// Loop Vectorize Hint. 1636 const LoopVectorizeHints *Hints; 1637 1638 /// The interleave access information contains groups of interleaved accesses 1639 /// with the same stride and close to each other. 1640 InterleavedAccessInfo &InterleaveInfo; 1641 1642 /// Values to ignore in the cost model. 1643 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1644 1645 /// Values to ignore in the cost model when VF > 1. 1646 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1647 }; 1648 1649 } // end namespace llvm 1650 1651 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1652 // vectorization. The loop needs to be annotated with #pragma omp simd 1653 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1654 // vector length information is not provided, vectorization is not considered 1655 // explicit. Interleave hints are not allowed either. These limitations will be 1656 // relaxed in the future. 1657 // Please, note that we are currently forced to abuse the pragma 'clang 1658 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1659 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1660 // provides *explicit vectorization hints* (LV can bypass legal checks and 1661 // assume that vectorization is legal). However, both hints are implemented 1662 // using the same metadata (llvm.loop.vectorize, processed by 1663 // LoopVectorizeHints). This will be fixed in the future when the native IR 1664 // representation for pragma 'omp simd' is introduced. 1665 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1666 OptimizationRemarkEmitter *ORE) { 1667 assert(!OuterLp->empty() && "This is not an outer loop"); 1668 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1669 1670 // Only outer loops with an explicit vectorization hint are supported. 1671 // Unannotated outer loops are ignored. 1672 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1673 return false; 1674 1675 Function *Fn = OuterLp->getHeader()->getParent(); 1676 if (!Hints.allowVectorization(Fn, OuterLp, 1677 true /*VectorizeOnlyWhenForced*/)) { 1678 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1679 return false; 1680 } 1681 1682 if (Hints.getInterleave() > 1) { 1683 // TODO: Interleave support is future work. 
1684 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1685 "outer loops.\n"); 1686 Hints.emitRemarkWithHints(); 1687 return false; 1688 } 1689 1690 return true; 1691 } 1692 1693 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1694 OptimizationRemarkEmitter *ORE, 1695 SmallVectorImpl<Loop *> &V) { 1696 // Collect inner loops and outer loops without irreducible control flow. For 1697 // now, only collect outer loops that have explicit vectorization hints. If we 1698 // are stress testing the VPlan H-CFG construction, we collect the outermost 1699 // loop of every loop nest. 1700 if (L.empty() || VPlanBuildStressTest || 1701 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1702 LoopBlocksRPO RPOT(&L); 1703 RPOT.perform(LI); 1704 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1705 V.push_back(&L); 1706 // TODO: Collect inner loops inside marked outer loops in case 1707 // vectorization fails for the outer loop. Do not invoke 1708 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1709 // already known to be reducible. We can use an inherited attribute for 1710 // that. 1711 return; 1712 } 1713 } 1714 for (Loop *InnerL : L) 1715 collectSupportedLoops(*InnerL, LI, ORE, V); 1716 } 1717 1718 namespace { 1719 1720 /// The LoopVectorize Pass. 1721 struct LoopVectorize : public FunctionPass { 1722 /// Pass identification, replacement for typeid 1723 static char ID; 1724 1725 LoopVectorizePass Impl; 1726 1727 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1728 bool VectorizeOnlyWhenForced = false) 1729 : FunctionPass(ID), 1730 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1731 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1732 } 1733 1734 bool runOnFunction(Function &F) override { 1735 if (skipFunction(F)) 1736 return false; 1737 1738 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1739 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1740 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1741 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1742 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1743 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1744 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 1745 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1746 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1747 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1748 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1749 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1750 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1751 1752 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1753 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1754 1755 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1756 GetLAA, *ORE, PSI).MadeAnyChange; 1757 } 1758 1759 void getAnalysisUsage(AnalysisUsage &AU) const override { 1760 AU.addRequired<AssumptionCacheTracker>(); 1761 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1762 AU.addRequired<DominatorTreeWrapperPass>(); 1763 AU.addRequired<LoopInfoWrapperPass>(); 1764 AU.addRequired<ScalarEvolutionWrapperPass>(); 1765 AU.addRequired<TargetTransformInfoWrapperPass>(); 1766 AU.addRequired<AAResultsWrapperPass>(); 1767 AU.addRequired<LoopAccessLegacyAnalysis>(); 1768 AU.addRequired<DemandedBitsWrapperPass>(); 1769 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1770 AU.addRequired<InjectTLIMappingsLegacy>(); 1771 1772 // We currently do not preserve loopinfo/dominator analyses with outer loop 1773 // vectorization. Until this is addressed, mark these analyses as preserved 1774 // only for non-VPlan-native path. 1775 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1776 if (!EnableVPlanNativePath) { 1777 AU.addPreserved<LoopInfoWrapperPass>(); 1778 AU.addPreserved<DominatorTreeWrapperPass>(); 1779 } 1780 1781 AU.addPreserved<BasicAAWrapperPass>(); 1782 AU.addPreserved<GlobalsAAWrapperPass>(); 1783 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1784 } 1785 }; 1786 1787 } // end anonymous namespace 1788 1789 //===----------------------------------------------------------------------===// 1790 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1791 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1792 //===----------------------------------------------------------------------===// 1793 1794 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1795 // We need to place the broadcast of invariant variables outside the loop, 1796 // but only if it's proven safe to do so. Else, broadcast will be inside 1797 // vector loop body. 1798 Instruction *Instr = dyn_cast<Instruction>(V); 1799 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1800 (!Instr || 1801 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1802 // Place the code for broadcasting invariant variables in the new preheader. 1803 IRBuilder<>::InsertPointGuard Guard(Builder); 1804 if (SafeToHoist) 1805 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1806 1807 // Broadcast the scalar into all locations in the vector. 
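  // For example, with VF = 4 and an i32 scalar %x (illustrative names), the
  // splat below is emitted, schematically, as an insertelement into lane zero
  // followed by a zero-mask shufflevector:
  //   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> undef, <4 x i32> zeroinitializer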
1808 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1809 1810 return Shuf; 1811 } 1812 1813 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1814 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1815 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1816 "Expected either an induction phi-node or a truncate of it!"); 1817 Value *Start = II.getStartValue(); 1818 1819 // Construct the initial value of the vector IV in the vector loop preheader 1820 auto CurrIP = Builder.saveIP(); 1821 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1822 if (isa<TruncInst>(EntryVal)) { 1823 assert(Start->getType()->isIntegerTy() && 1824 "Truncation requires an integer type"); 1825 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1826 Step = Builder.CreateTrunc(Step, TruncType); 1827 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1828 } 1829 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1830 Value *SteppedStart = 1831 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1832 1833 // We create vector phi nodes for both integer and floating-point induction 1834 // variables. Here, we determine the kind of arithmetic we will perform. 1835 Instruction::BinaryOps AddOp; 1836 Instruction::BinaryOps MulOp; 1837 if (Step->getType()->isIntegerTy()) { 1838 AddOp = Instruction::Add; 1839 MulOp = Instruction::Mul; 1840 } else { 1841 AddOp = II.getInductionOpcode(); 1842 MulOp = Instruction::FMul; 1843 } 1844 1845 // Multiply the vectorization factor by the step using integer or 1846 // floating-point arithmetic as appropriate. 1847 Value *ConstVF = 1848 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 1849 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1850 1851 // Create a vector splat to use in the induction update. 1852 // 1853 // FIXME: If the step is non-constant, we create the vector splat with 1854 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1855 // handle a constant vector splat. 1856 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1857 Value *SplatVF = isa<Constant>(Mul) 1858 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 1859 : Builder.CreateVectorSplat(VF, Mul); 1860 Builder.restoreIP(CurrIP); 1861 1862 // We may need to add the step a number of times, depending on the unroll 1863 // factor. The last of those goes into the PHI. 1864 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1865 &*LoopVectorBody->getFirstInsertionPt()); 1866 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1867 Instruction *LastInduction = VecInd; 1868 for (unsigned Part = 0; Part < UF; ++Part) { 1869 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1870 1871 if (isa<TruncInst>(EntryVal)) 1872 addMetadata(LastInduction, EntryVal); 1873 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1874 1875 LastInduction = cast<Instruction>(addFastMathFlag( 1876 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1877 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1878 } 1879 1880 // Move the last step to the end of the latch block. This ensures consistent 1881 // placement of all induction updates. 
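  // For example, with UF = 2 the loop above produces the chain
  //   vec.ind -> step.add -> step.add,
  // where the final step.add is renamed vec.ind.next below and becomes the
  // backedge value of the vector induction phi.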
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}

bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}

bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}

void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
    const InductionDescriptor &ID, const Instruction *EntryVal,
    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
  if (isa<TruncInst>(EntryVal))
    return;

  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (Casts.empty())
    return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
  // induction update chain itself.
  Instruction *CastInst = *Casts.begin();
  if (Lane < UINT_MAX)
    VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
  else
    VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
}

void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars().find(IV);
  assert(II != Legal->getInductionVars().end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // Generate code for the induction step.
Note that induction steps are 1952 // required to be loop-invariant 1953 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1954 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1955 "Induction step should be loop invariant"); 1956 if (PSE.getSE()->isSCEVable(IV->getType())) { 1957 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1958 return Exp.expandCodeFor(Step, Step->getType(), 1959 LoopVectorPreHeader->getTerminator()); 1960 } 1961 return cast<SCEVUnknown>(Step)->getValue(); 1962 }; 1963 1964 // The scalar value to broadcast. This is derived from the canonical 1965 // induction variable. If a truncation type is given, truncate the canonical 1966 // induction variable and step. Otherwise, derive these values from the 1967 // induction descriptor. 1968 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1969 Value *ScalarIV = Induction; 1970 if (IV != OldInduction) { 1971 ScalarIV = IV->getType()->isIntegerTy() 1972 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1973 : Builder.CreateCast(Instruction::SIToFP, Induction, 1974 IV->getType()); 1975 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1976 ScalarIV->setName("offset.idx"); 1977 } 1978 if (Trunc) { 1979 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1980 assert(Step->getType()->isIntegerTy() && 1981 "Truncation requires an integer step"); 1982 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1983 Step = Builder.CreateTrunc(Step, TruncType); 1984 } 1985 return ScalarIV; 1986 }; 1987 1988 // Create the vector values from the scalar IV, in the absence of creating a 1989 // vector IV. 1990 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1991 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1992 for (unsigned Part = 0; Part < UF; ++Part) { 1993 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1994 Value *EntryPart = 1995 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 1996 ID.getInductionOpcode()); 1997 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1998 if (Trunc) 1999 addMetadata(EntryPart, Trunc); 2000 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 2001 } 2002 }; 2003 2004 // Now do the actual transformations, and start with creating the step value. 2005 Value *Step = CreateStepValue(ID.getStep()); 2006 if (VF.isZero() || VF.isScalar()) { 2007 Value *ScalarIV = CreateScalarIV(Step); 2008 CreateSplatIV(ScalarIV, Step); 2009 return; 2010 } 2011 2012 // Determine if we want a scalar version of the induction variable. This is 2013 // true if the induction variable itself is not widened, or if it has at 2014 // least one user in the loop that is not widened. 2015 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2016 if (!NeedsScalarIV) { 2017 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2018 return; 2019 } 2020 2021 // Try to create a new independent vector induction variable. If we can't 2022 // create the phi node, we will splat the scalar induction variable in each 2023 // loop iteration. 2024 if (!shouldScalarizeInstruction(EntryVal)) { 2025 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2026 Value *ScalarIV = CreateScalarIV(Step); 2027 // Create scalar steps that can be used by instructions we will later 2028 // scalarize. Note that the addition of the scalar steps will not increase 2029 // the number of instructions in the loop in the common case prior to 2030 // InstCombine. We will be trading one vector extract for each scalar step. 
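    // For example, with VF = 4, UF = 1 and an integer step S, buildScalarSteps
    // below emits ScalarIV + 0 * S, ScalarIV + 1 * S, ScalarIV + 2 * S and
    // ScalarIV + 3 * S (or only the lane-zero value if EntryVal is uniform
    // after vectorization).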
2031 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2032 return; 2033 } 2034 2035 // All IV users are scalar instructions, so only emit a scalar IV, not a 2036 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2037 // predicate used by the masked loads/stores. 2038 Value *ScalarIV = CreateScalarIV(Step); 2039 if (!Cost->isScalarEpilogueAllowed()) 2040 CreateSplatIV(ScalarIV, Step); 2041 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2042 } 2043 2044 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2045 Instruction::BinaryOps BinOp) { 2046 // Create and check the types. 2047 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2048 int VLen = ValVTy->getNumElements(); 2049 2050 Type *STy = Val->getType()->getScalarType(); 2051 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2052 "Induction Step must be an integer or FP"); 2053 assert(Step->getType() == STy && "Step has wrong type"); 2054 2055 SmallVector<Constant *, 8> Indices; 2056 2057 if (STy->isIntegerTy()) { 2058 // Create a vector of consecutive numbers from zero to VF. 2059 for (int i = 0; i < VLen; ++i) 2060 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2061 2062 // Add the consecutive indices to the vector value. 2063 Constant *Cv = ConstantVector::get(Indices); 2064 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2065 Step = Builder.CreateVectorSplat(VLen, Step); 2066 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2067 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2068 // which can be found from the original scalar operations. 2069 Step = Builder.CreateMul(Cv, Step); 2070 return Builder.CreateAdd(Val, Step, "induction"); 2071 } 2072 2073 // Floating point induction. 2074 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2075 "Binary Opcode should be specified for FP induction"); 2076 // Create a vector of consecutive numbers from zero to VF. 2077 for (int i = 0; i < VLen; ++i) 2078 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2079 2080 // Add the consecutive indices to the vector value. 2081 Constant *Cv = ConstantVector::get(Indices); 2082 2083 Step = Builder.CreateVectorSplat(VLen, Step); 2084 2085 // Floating point operations had to be 'fast' to enable the induction. 2086 FastMathFlags Flags; 2087 Flags.setFast(); 2088 2089 Value *MulOp = Builder.CreateFMul(Cv, Step); 2090 if (isa<Instruction>(MulOp)) 2091 // Have to check, MulOp may be a constant 2092 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2093 2094 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2095 if (isa<Instruction>(BOp)) 2096 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2097 return BOp; 2098 } 2099 2100 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2101 Instruction *EntryVal, 2102 const InductionDescriptor &ID) { 2103 // We shouldn't have to build scalar steps if we aren't vectorizing. 2104 assert(VF.isVector() && "VF should be greater than one"); 2105 assert(!VF.isScalable() && 2106 "the code below assumes a fixed number of elements at compile time"); 2107 // Get the value type and ensure it and the step have the same integer type. 2108 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2109 assert(ScalarIVTy == Step->getType() && 2110 "Val and Step should have the same type"); 2111 2112 // We build scalar steps for both integer and floating-point induction 2113 // variables. 
Here, we determine the kind of arithmetic we will perform. 2114 Instruction::BinaryOps AddOp; 2115 Instruction::BinaryOps MulOp; 2116 if (ScalarIVTy->isIntegerTy()) { 2117 AddOp = Instruction::Add; 2118 MulOp = Instruction::Mul; 2119 } else { 2120 AddOp = ID.getInductionOpcode(); 2121 MulOp = Instruction::FMul; 2122 } 2123 2124 // Determine the number of scalars we need to generate for each unroll 2125 // iteration. If EntryVal is uniform, we only need to generate the first 2126 // lane. Otherwise, we generate all VF values. 2127 unsigned Lanes = 2128 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2129 ? 1 2130 : VF.getKnownMinValue(); 2131 // Compute the scalar steps and save the results in VectorLoopValueMap. 2132 for (unsigned Part = 0; Part < UF; ++Part) { 2133 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2134 auto *StartIdx = getSignedIntOrFpConstant( 2135 ScalarIVTy, VF.getKnownMinValue() * Part + Lane); 2136 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2137 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2138 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2139 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2140 } 2141 } 2142 } 2143 2144 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2145 assert(V != Induction && "The new induction variable should not be used."); 2146 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2147 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2148 2149 // If we have a stride that is replaced by one, do it here. Defer this for 2150 // the VPlan-native path until we start running Legal checks in that path. 2151 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2152 V = ConstantInt::get(V->getType(), 1); 2153 2154 // If we have a vector mapped to this value, return it. 2155 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2156 return VectorLoopValueMap.getVectorValue(V, Part); 2157 2158 // If the value has not been vectorized, check if it has been scalarized 2159 // instead. If it has been scalarized, and we actually need the value in 2160 // vector form, we will construct the vector values on demand. 2161 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2162 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2163 2164 // If we've scalarized a value, that value should be an instruction. 2165 auto *I = cast<Instruction>(V); 2166 2167 // If we aren't vectorizing, we can just copy the scalar map values over to 2168 // the vector map. 2169 if (VF == 1) { 2170 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2171 return ScalarValue; 2172 } 2173 2174 // Get the last scalar instruction we generated for V and Part. If the value 2175 // is known to be uniform after vectorization, this corresponds to lane zero 2176 // of the Part unroll iteration. Otherwise, the last instruction is the one 2177 // we created for the last vector lane of the Part unroll iteration. 2178 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2179 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2180 ? 0 2181 : VF.getKnownMinValue() - 1; 2182 auto *LastInst = cast<Instruction>( 2183 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2184 2185 // Set the insert point after the last scalarized instruction. This ensures 2186 // the insertelement sequence will directly follow the scalar definitions. 
2187 auto OldIP = Builder.saveIP(); 2188 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2189 Builder.SetInsertPoint(&*NewIP); 2190 2191 // However, if we are vectorizing, we need to construct the vector values. 2192 // If the value is known to be uniform after vectorization, we can just 2193 // broadcast the scalar value corresponding to lane zero for each unroll 2194 // iteration. Otherwise, we construct the vector values using insertelement 2195 // instructions. Since the resulting vectors are stored in 2196 // VectorLoopValueMap, we will only generate the insertelements once. 2197 Value *VectorValue = nullptr; 2198 if (Cost->isUniformAfterVectorization(I, VF)) { 2199 VectorValue = getBroadcastInstrs(ScalarValue); 2200 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2201 } else { 2202 // Initialize packing with insertelements to start from undef. 2203 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2204 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2205 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2206 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2207 packScalarIntoVectorValue(V, {Part, Lane}); 2208 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2209 } 2210 Builder.restoreIP(OldIP); 2211 return VectorValue; 2212 } 2213 2214 // If this scalar is unknown, assume that it is a constant or that it is 2215 // loop invariant. Broadcast V and save the value for future uses. 2216 Value *B = getBroadcastInstrs(V); 2217 VectorLoopValueMap.setVectorValue(V, Part, B); 2218 return B; 2219 } 2220 2221 Value * 2222 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2223 const VPIteration &Instance) { 2224 // If the value is not an instruction contained in the loop, it should 2225 // already be scalar. 2226 if (OrigLoop->isLoopInvariant(V)) 2227 return V; 2228 2229 assert(Instance.Lane > 0 2230 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2231 : true && "Uniform values only have lane zero"); 2232 2233 // If the value from the original loop has not been vectorized, it is 2234 // represented by UF x VF scalar values in the new loop. Return the requested 2235 // scalar value. 2236 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2237 return VectorLoopValueMap.getScalarValue(V, Instance); 2238 2239 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2240 // for the given unroll part. If this entry is not a vector type (i.e., the 2241 // vectorization factor is one), there is no need to generate an 2242 // extractelement instruction. 2243 auto *U = getOrCreateVectorValue(V, Instance.Part); 2244 if (!U->getType()->isVectorTy()) { 2245 assert(VF == 1 && "Value not scalarized has non-vector type"); 2246 return U; 2247 } 2248 2249 // Otherwise, the value from the original loop has been vectorized and is 2250 // represented by UF vector values. Extract and return the requested scalar 2251 // value from the appropriate vector lane. 
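  // For example, for Instance = {Part = 1, Lane = 2} and a <4 x i32> part
  // named %part1 (illustrative name), this emits, schematically:
  //   %scalar = extractelement <4 x i32> %part1, i32 2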
2252 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2253 } 2254 2255 void InnerLoopVectorizer::packScalarIntoVectorValue( 2256 Value *V, const VPIteration &Instance) { 2257 assert(V != Induction && "The new induction variable should not be used."); 2258 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2259 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2260 2261 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2262 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2263 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2264 Builder.getInt32(Instance.Lane)); 2265 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2266 } 2267 2268 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2269 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2270 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2271 SmallVector<int, 8> ShuffleMask; 2272 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2273 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2274 2275 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2276 ShuffleMask, "reverse"); 2277 } 2278 2279 // Return whether we allow using masked interleave-groups (for dealing with 2280 // strided loads/stores that reside in predicated blocks, or for dealing 2281 // with gaps). 2282 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2283 // If an override option has been passed in for interleaved accesses, use it. 2284 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2285 return EnableMaskedInterleavedMemAccesses; 2286 2287 return TTI.enableMaskedInterleavedAccessVectorization(); 2288 } 2289 2290 // Try to vectorize the interleave group that \p Instr belongs to. 2291 // 2292 // E.g. Translate following interleaved load group (factor = 3): 2293 // for (i = 0; i < N; i+=3) { 2294 // R = Pic[i]; // Member of index 0 2295 // G = Pic[i+1]; // Member of index 1 2296 // B = Pic[i+2]; // Member of index 2 2297 // ... // do something to R, G, B 2298 // } 2299 // To: 2300 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2301 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2302 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2303 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2304 // 2305 // Or translate following interleaved store group (factor = 3): 2306 // for (i = 0; i < N; i+=3) { 2307 // ... do something to R, G, B 2308 // Pic[i] = R; // Member of index 0 2309 // Pic[i+1] = G; // Member of index 1 2310 // Pic[i+2] = B; // Member of index 2 2311 // } 2312 // To: 2313 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2314 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2315 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2316 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2317 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2318 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2319 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2320 VPValue *Addr, VPValue *BlockInMask) { 2321 Instruction *Instr = Group->getInsertPos(); 2322 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2323 2324 // Prepare for the vector type of the interleaved load/store. 
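  // For example, an i32 group with interleave factor 3 vectorized at VF = 4
  // uses the wide vector type <12 x i32> (VF * InterleaveFactor elements)
  // computed below.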
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  assert(!VF.isScalable() &&
         "scalable vector reverse operation is not implemented");
  if (Group->isReverse())
    Index += (VF.getKnownMinValue() - 1) * Group->getFactor();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, {Part, 0});
    setDebugLocFromInst(Builder, AddrPart);

    // Notice that the current instruction could be a member at any index.
    // We need to adjust the address to the member of index 0.
    //
    // E.g. a = A[i+1];   // Member of index 1 (Current instruction)
    //      b = A[i];     // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g. A[i+1] = a;   // Member of index 1
    //      A[i]   = b;   // Member of index 0
    //      A[i+2] = c;   // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *UndefVec = UndefValue::get(VecTy);

  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    assert(!VF.isScalable() && "scalable vectors not yet supported.");
    MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
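    // If the group is predicated, the per-lane block mask is replicated across
    // the interleave factor before it is applied to the wide load, e.g. a
    // VF = 4 mask <m0, m1, m2, m3> becomes
    // <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3> for factor 3.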
2390 SmallVector<Value *, 2> NewLoads; 2391 for (unsigned Part = 0; Part < UF; Part++) { 2392 Instruction *NewLoad; 2393 if (BlockInMask || MaskForGaps) { 2394 assert(useMaskedInterleavedAccesses(*TTI) && 2395 "masked interleaved groups are not allowed."); 2396 Value *GroupMask = MaskForGaps; 2397 if (BlockInMask) { 2398 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2399 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2400 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2401 Value *ShuffledMask = Builder.CreateShuffleVector( 2402 BlockInMaskPart, Undefs, 2403 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2404 "interleaved.mask"); 2405 GroupMask = MaskForGaps 2406 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2407 MaskForGaps) 2408 : ShuffledMask; 2409 } 2410 NewLoad = 2411 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2412 GroupMask, UndefVec, "wide.masked.vec"); 2413 } 2414 else 2415 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2416 Group->getAlign(), "wide.vec"); 2417 Group->addMetadata(NewLoad); 2418 NewLoads.push_back(NewLoad); 2419 } 2420 2421 // For each member in the group, shuffle out the appropriate data from the 2422 // wide loads. 2423 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2424 Instruction *Member = Group->getMember(I); 2425 2426 // Skip the gaps in the group. 2427 if (!Member) 2428 continue; 2429 2430 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2431 auto StrideMask = 2432 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2433 for (unsigned Part = 0; Part < UF; Part++) { 2434 Value *StridedVec = Builder.CreateShuffleVector( 2435 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2436 2437 // If this member has different type, cast the result type. 2438 if (Member->getType() != ScalarTy) { 2439 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2440 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2441 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2442 } 2443 2444 if (Group->isReverse()) 2445 StridedVec = reverseVector(StridedVec); 2446 2447 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2448 } 2449 } 2450 return; 2451 } 2452 2453 // The sub vector type for current instruction. 2454 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2455 auto *SubVT = VectorType::get(ScalarTy, VF); 2456 2457 // Vectorize the interleaved store group. 2458 for (unsigned Part = 0; Part < UF; Part++) { 2459 // Collect the stored vector from each member. 2460 SmallVector<Value *, 4> StoredVecs; 2461 for (unsigned i = 0; i < InterleaveFactor; i++) { 2462 // Interleaved store group doesn't allow a gap, so each index has a member 2463 Instruction *Member = Group->getMember(i); 2464 assert(Member && "Fail to get a member from an interleaved store group"); 2465 2466 Value *StoredVec = getOrCreateVectorValue( 2467 cast<StoreInst>(Member)->getValueOperand(), Part); 2468 if (Group->isReverse()) 2469 StoredVec = reverseVector(StoredVec); 2470 2471 // If this member has different type, cast it to a unified type. 2472 2473 if (StoredVec->getType() != SubVT) 2474 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2475 2476 StoredVecs.push_back(StoredVec); 2477 } 2478 2479 // Concatenate all vectors into a wide vector. 2480 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2481 2482 // Interleave the elements in the wide vector. 
2483 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2484 Value *IVec = Builder.CreateShuffleVector( 2485 WideVec, UndefVec, 2486 createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2487 "interleaved.vec"); 2488 2489 Instruction *NewStoreInstr; 2490 if (BlockInMask) { 2491 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2492 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2493 Value *ShuffledMask = Builder.CreateShuffleVector( 2494 BlockInMaskPart, Undefs, 2495 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2496 "interleaved.mask"); 2497 NewStoreInstr = Builder.CreateMaskedStore( 2498 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2499 } 2500 else 2501 NewStoreInstr = 2502 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2503 2504 Group->addMetadata(NewStoreInstr); 2505 } 2506 } 2507 2508 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2509 VPTransformState &State, 2510 VPValue *Addr, 2511 VPValue *StoredValue, 2512 VPValue *BlockInMask) { 2513 // Attempt to issue a wide load. 2514 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2515 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2516 2517 assert((LI || SI) && "Invalid Load/Store instruction"); 2518 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2519 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2520 2521 LoopVectorizationCostModel::InstWidening Decision = 2522 Cost->getWideningDecision(Instr, VF); 2523 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2524 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2525 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2526 "CM decision is not to widen the memory instruction"); 2527 2528 Type *ScalarDataTy = getMemInstValueType(Instr); 2529 2530 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2531 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2532 const Align Alignment = getLoadStoreAlignment(Instr); 2533 2534 // Determine if the pointer operand of the access is either consecutive or 2535 // reverse consecutive. 2536 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2537 bool ConsecutiveStride = 2538 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2539 bool CreateGatherScatter = 2540 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2541 2542 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2543 // gather/scatter. Otherwise Decision should have been to Scalarize. 2544 assert((ConsecutiveStride || CreateGatherScatter) && 2545 "The instruction should be scalarized"); 2546 (void)ConsecutiveStride; 2547 2548 VectorParts BlockInMaskParts(UF); 2549 bool isMaskRequired = BlockInMask; 2550 if (isMaskRequired) 2551 for (unsigned Part = 0; Part < UF; ++Part) 2552 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2553 2554 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2555 // Calculate the pointer for the specific unroll-part. 2556 GetElementPtrInst *PartPtr = nullptr; 2557 2558 bool InBounds = false; 2559 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2560 InBounds = gep->isInBounds(); 2561 2562 if (Reverse) { 2563 // If the address is consecutive but reversed, then the 2564 // wide store needs to start at the last vector element. 
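      // For example, with VF = 4 and Part = 1 the two GEPs below compute
      // Ptr - 4 and then Ptr - 7, so the reversed wide access covers the
      // elements Ptr[-7] .. Ptr[-4].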
2565 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2566 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2567 PartPtr->setIsInBounds(InBounds); 2568 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2569 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2570 PartPtr->setIsInBounds(InBounds); 2571 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2572 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2573 } else { 2574 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2575 ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue()))); 2576 PartPtr->setIsInBounds(InBounds); 2577 } 2578 2579 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2580 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2581 }; 2582 2583 // Handle Stores: 2584 if (SI) { 2585 setDebugLocFromInst(Builder, SI); 2586 2587 for (unsigned Part = 0; Part < UF; ++Part) { 2588 Instruction *NewSI = nullptr; 2589 Value *StoredVal = State.get(StoredValue, Part); 2590 if (CreateGatherScatter) { 2591 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2592 Value *VectorGep = State.get(Addr, Part); 2593 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2594 MaskPart); 2595 } else { 2596 if (Reverse) { 2597 // If we store to reverse consecutive memory locations, then we need 2598 // to reverse the order of elements in the stored value. 2599 StoredVal = reverseVector(StoredVal); 2600 // We don't want to update the value in the map as it might be used in 2601 // another expression. So don't call resetVectorValue(StoredVal). 2602 } 2603 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2604 if (isMaskRequired) 2605 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2606 BlockInMaskParts[Part]); 2607 else 2608 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2609 } 2610 addMetadata(NewSI, SI); 2611 } 2612 return; 2613 } 2614 2615 // Handle loads. 2616 assert(LI && "Must have a load instruction"); 2617 setDebugLocFromInst(Builder, LI); 2618 for (unsigned Part = 0; Part < UF; ++Part) { 2619 Value *NewLI; 2620 if (CreateGatherScatter) { 2621 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2622 Value *VectorGep = State.get(Addr, Part); 2623 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2624 nullptr, "wide.masked.gather"); 2625 addMetadata(NewLI, LI); 2626 } else { 2627 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2628 if (isMaskRequired) 2629 NewLI = Builder.CreateMaskedLoad( 2630 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2631 "wide.masked.load"); 2632 else 2633 NewLI = 2634 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2635 2636 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2637 addMetadata(NewLI, LI); 2638 if (Reverse) 2639 NewLI = reverseVector(NewLI); 2640 } 2641 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2642 } 2643 } 2644 2645 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2646 const VPIteration &Instance, 2647 bool IfPredicateInstr, 2648 VPTransformState &State) { 2649 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2650 2651 setDebugLocFromInst(Builder, Instr); 2652 2653 // Does this instruction return a value ? 
2654 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2655 2656 Instruction *Cloned = Instr->clone(); 2657 if (!IsVoidRetTy) 2658 Cloned->setName(Instr->getName() + ".cloned"); 2659 2660 // Replace the operands of the cloned instructions with their scalar 2661 // equivalents in the new loop. 2662 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2663 auto *NewOp = State.get(User.getOperand(op), Instance); 2664 Cloned->setOperand(op, NewOp); 2665 } 2666 addNewMetadata(Cloned, Instr); 2667 2668 // Place the cloned scalar in the new loop. 2669 Builder.Insert(Cloned); 2670 2671 // Add the cloned scalar to the scalar map entry. 2672 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2673 2674 // If we just cloned a new assumption, add it the assumption cache. 2675 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2676 if (II->getIntrinsicID() == Intrinsic::assume) 2677 AC->registerAssumption(II); 2678 2679 // End if-block. 2680 if (IfPredicateInstr) 2681 PredicatedInstructions.push_back(Cloned); 2682 } 2683 2684 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2685 Value *End, Value *Step, 2686 Instruction *DL) { 2687 BasicBlock *Header = L->getHeader(); 2688 BasicBlock *Latch = L->getLoopLatch(); 2689 // As we're just creating this loop, it's possible no latch exists 2690 // yet. If so, use the header as this will be a single block loop. 2691 if (!Latch) 2692 Latch = Header; 2693 2694 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2695 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2696 setDebugLocFromInst(Builder, OldInst); 2697 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2698 2699 Builder.SetInsertPoint(Latch->getTerminator()); 2700 setDebugLocFromInst(Builder, OldInst); 2701 2702 // Create i+1 and fill the PHINode. 2703 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2704 Induction->addIncoming(Start, L->getLoopPreheader()); 2705 Induction->addIncoming(Next, Latch); 2706 // Create the compare. 2707 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2708 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2709 2710 // Now we have two terminators. Remove the old one from the block. 2711 Latch->getTerminator()->eraseFromParent(); 2712 2713 return Induction; 2714 } 2715 2716 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2717 if (TripCount) 2718 return TripCount; 2719 2720 assert(L && "Create Trip Count for null loop."); 2721 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2722 // Find the loop boundaries. 2723 ScalarEvolution *SE = PSE.getSE(); 2724 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2725 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2726 "Invalid loop count"); 2727 2728 Type *IdxTy = Legal->getWidestInductionType(); 2729 assert(IdxTy && "No type for induction"); 2730 2731 // The exit count might have the type of i64 while the phi is i32. This can 2732 // happen if we have an induction variable that is sign extended before the 2733 // compare. The only way that we get a backedge taken count is that the 2734 // induction variable was signed and as such will not overflow. In such a case 2735 // truncation is legal. 
2736 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2737 IdxTy->getPrimitiveSizeInBits()) 2738 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2739 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2740 2741 // Get the total trip count from the count by adding 1. 2742 const SCEV *ExitCount = SE->getAddExpr( 2743 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2744 2745 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2746 2747 // Expand the trip count and place the new instructions in the preheader. 2748 // Notice that the pre-header does not change, only the loop body. 2749 SCEVExpander Exp(*SE, DL, "induction"); 2750 2751 // Count holds the overall loop count (N). 2752 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2753 L->getLoopPreheader()->getTerminator()); 2754 2755 if (TripCount->getType()->isPointerTy()) 2756 TripCount = 2757 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2758 L->getLoopPreheader()->getTerminator()); 2759 2760 return TripCount; 2761 } 2762 2763 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2764 if (VectorTripCount) 2765 return VectorTripCount; 2766 2767 Value *TC = getOrCreateTripCount(L); 2768 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2769 2770 Type *Ty = TC->getType(); 2771 // This is where we can make the step a runtime constant. 2772 assert(!VF.isScalable() && "scalable vectorization is not supported yet"); 2773 Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF); 2774 2775 // If the tail is to be folded by masking, round the number of iterations N 2776 // up to a multiple of Step instead of rounding down. This is done by first 2777 // adding Step-1 and then rounding down. Note that it's ok if this addition 2778 // overflows: the vector induction variable will eventually wrap to zero given 2779 // that it starts at zero and its Step is a power of two; the loop will then 2780 // exit, with the last early-exit vector comparison also producing all-true. 2781 if (Cost->foldTailByMasking()) { 2782 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2783 "VF*UF must be a power of 2 when folding tail by masking"); 2784 TC = Builder.CreateAdd( 2785 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 2786 } 2787 2788 // Now we need to generate the expression for the part of the loop that the 2789 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2790 // iterations are not required for correctness, or N - Step, otherwise. Step 2791 // is equal to the vectorization factor (number of SIMD elements) times the 2792 // unroll factor (number of SIMD instructions). 2793 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2794 2795 // If there is a non-reversed interleaved group that may speculatively access 2796 // memory out-of-bounds, we need to ensure that there will be at least one 2797 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2798 // the trip count, we set the remainder to be equal to the step. If the step 2799 // does not evenly divide the trip count, no adjustment is necessary since 2800 // there will already be scalar iterations. Note that the minimum iterations 2801 // check ensures that N >= Step. 
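// As a worked example, take VF = 4 and UF = 2, so Step = 8: N = 13 gives a
// remainder of 5 and a vector trip count of 8; with tail folding, N is first
// rounded up so the vector trip count becomes 16 and the masked vector loop
// covers all iterations. If N = 16 and a scalar epilogue is required, the zero
// remainder is bumped up to Step, leaving a vector trip count of 8 and eight
// iterations for the scalar epilogue loop.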
2802 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 2803 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2804 R = Builder.CreateSelect(IsZero, Step, R); 2805 } 2806 2807 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2808 2809 return VectorTripCount; 2810 } 2811 2812 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2813 const DataLayout &DL) { 2814 // Verify that V is a vector type with same number of elements as DstVTy. 2815 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2816 unsigned VF = DstFVTy->getNumElements(); 2817 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2818 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2819 Type *SrcElemTy = SrcVecTy->getElementType(); 2820 Type *DstElemTy = DstFVTy->getElementType(); 2821 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2822 "Vector elements must have same size"); 2823 2824 // Do a direct cast if element types are castable. 2825 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2826 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2827 } 2828 // V cannot be directly casted to desired vector type. 2829 // May happen when V is a floating point vector but DstVTy is a vector of 2830 // pointers or vice-versa. Handle this using a two-step bitcast using an 2831 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2832 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2833 "Only one type should be a pointer type"); 2834 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2835 "Only one type should be a floating point type"); 2836 Type *IntTy = 2837 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2838 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2839 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2840 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2841 } 2842 2843 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2844 BasicBlock *Bypass) { 2845 Value *Count = getOrCreateTripCount(L); 2846 // Reuse existing vector loop preheader for TC checks. 2847 // Note that new preheader block is generated for vector loop. 2848 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2849 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2850 2851 // Generate code to check if the loop's trip count is less than VF * UF, or 2852 // equal to it in case a scalar epilogue is required; this implies that the 2853 // vector trip count is zero. This check also covers the case where adding one 2854 // to the backedge-taken count overflowed leading to an incorrect trip count 2855 // of zero. In this case we will also jump to the scalar loop. 2856 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2857 : ICmpInst::ICMP_ULT; 2858 2859 // If tail is to be folded, vector loop takes care of all iterations. 2860 Value *CheckMinIters = Builder.getFalse(); 2861 if (!Cost->foldTailByMasking()) { 2862 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2863 CheckMinIters = Builder.CreateICmp( 2864 P, Count, 2865 ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF), 2866 "min.iters.check"); 2867 } 2868 // Create new preheader for vector loop. 
2869 LoopVectorPreHeader = 2870 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2871 "vector.ph"); 2872 2873 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2874 DT->getNode(Bypass)->getIDom()) && 2875 "TC check is expected to dominate Bypass"); 2876 2877 // Update dominator for Bypass & LoopExit. 2878 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2879 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2880 2881 ReplaceInstWithInst( 2882 TCCheckBlock->getTerminator(), 2883 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2884 LoopBypassBlocks.push_back(TCCheckBlock); 2885 } 2886 2887 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2888 // Reuse existing vector loop preheader for SCEV checks. 2889 // Note that new preheader block is generated for vector loop. 2890 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2891 2892 // Generate the code to check that the SCEV assumptions that we made. 2893 // We want the new basic block to start at the first instruction in a 2894 // sequence of instructions that form a check. 2895 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2896 "scev.check"); 2897 Value *SCEVCheck = Exp.expandCodeForPredicate( 2898 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2899 2900 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2901 if (C->isZero()) 2902 return; 2903 2904 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2905 (OptForSizeBasedOnProfile && 2906 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2907 "Cannot SCEV check stride or overflow when optimizing for size"); 2908 2909 SCEVCheckBlock->setName("vector.scevcheck"); 2910 // Create new preheader for vector loop. 2911 LoopVectorPreHeader = 2912 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2913 nullptr, "vector.ph"); 2914 2915 // Update dominator only if this is first RT check. 2916 if (LoopBypassBlocks.empty()) { 2917 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2918 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2919 } 2920 2921 ReplaceInstWithInst( 2922 SCEVCheckBlock->getTerminator(), 2923 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2924 LoopBypassBlocks.push_back(SCEVCheckBlock); 2925 AddedSafetyChecks = true; 2926 } 2927 2928 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2929 // VPlan-native path does not do any analysis for runtime checks currently. 2930 if (EnableVPlanNativePath) 2931 return; 2932 2933 // Reuse existing vector loop preheader for runtime memory checks. 2934 // Note that new preheader block is generated for vector loop. 2935 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2936 2937 // Generate the code that checks in runtime if arrays overlap. We put the 2938 // checks into a separate block to make the more common case of few elements 2939 // faster. 
2940 auto *LAI = Legal->getLAI(); 2941 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 2942 if (!RtPtrChecking.Need) 2943 return; 2944 2945 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2946 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2947 "Cannot emit memory checks when optimizing for size, unless forced " 2948 "to vectorize."); 2949 ORE->emit([&]() { 2950 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2951 L->getStartLoc(), L->getHeader()) 2952 << "Code-size may be reduced by not forcing " 2953 "vectorization, or by source-code modifications " 2954 "eliminating the need for runtime checks " 2955 "(e.g., adding 'restrict')."; 2956 }); 2957 } 2958 2959 MemCheckBlock->setName("vector.memcheck"); 2960 // Create new preheader for vector loop. 2961 LoopVectorPreHeader = 2962 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2963 "vector.ph"); 2964 2965 auto *CondBranch = cast<BranchInst>( 2966 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 2967 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 2968 LoopBypassBlocks.push_back(MemCheckBlock); 2969 AddedSafetyChecks = true; 2970 2971 // Update dominator only if this is first RT check. 2972 if (LoopBypassBlocks.empty()) { 2973 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2974 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2975 } 2976 2977 Instruction *FirstCheckInst; 2978 Instruction *MemRuntimeCheck; 2979 std::tie(FirstCheckInst, MemRuntimeCheck) = 2980 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 2981 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 2982 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 2983 "claimed checks are required"); 2984 CondBranch->setCondition(MemRuntimeCheck); 2985 2986 // We currently don't use LoopVersioning for the actual loop cloning but we 2987 // still use it to add the noalias metadata. 2988 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2989 PSE.getSE()); 2990 LVer->prepareNoAliasMetadata(); 2991 } 2992 2993 Value *InnerLoopVectorizer::emitTransformedIndex( 2994 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2995 const InductionDescriptor &ID) const { 2996 2997 SCEVExpander Exp(*SE, DL, "induction"); 2998 auto Step = ID.getStep(); 2999 auto StartValue = ID.getStartValue(); 3000 assert(Index->getType() == Step->getType() && 3001 "Index type does not match StepValue type"); 3002 3003 // Note: the IR at this point is broken. We cannot use SE to create any new 3004 // SCEV and then expand it, hoping that SCEV's simplification will give us 3005 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3006 // lead to various SCEV crashes. So all we can do is to use builder and rely 3007 // on InstCombine for future simplifications. Here we handle some trivial 3008 // cases only. 
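// For instance, an integer induction with start value S and step C maps an
// index I to S + I * C (or simply S - I when the step is the constant -1); the
// CreateAdd/CreateMul helpers below fold the trivial X + 0 and X * 1 forms by
// hand because, as noted above, we cannot lean on SCEV simplification here.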
3009 auto CreateAdd = [&B](Value *X, Value *Y) { 3010 assert(X->getType() == Y->getType() && "Types don't match!"); 3011 if (auto *CX = dyn_cast<ConstantInt>(X)) 3012 if (CX->isZero()) 3013 return Y; 3014 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3015 if (CY->isZero()) 3016 return X; 3017 return B.CreateAdd(X, Y); 3018 }; 3019 3020 auto CreateMul = [&B](Value *X, Value *Y) { 3021 assert(X->getType() == Y->getType() && "Types don't match!"); 3022 if (auto *CX = dyn_cast<ConstantInt>(X)) 3023 if (CX->isOne()) 3024 return Y; 3025 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3026 if (CY->isOne()) 3027 return X; 3028 return B.CreateMul(X, Y); 3029 }; 3030 3031 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3032 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3033 // the DomTree is not kept up-to-date for additional blocks generated in the 3034 // vector loop. By using the header as insertion point, we guarantee that the 3035 // expanded instructions dominate all their uses. 3036 auto GetInsertPoint = [this, &B]() { 3037 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3038 if (InsertBB != LoopVectorBody && 3039 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3040 return LoopVectorBody->getTerminator(); 3041 return &*B.GetInsertPoint(); 3042 }; 3043 switch (ID.getKind()) { 3044 case InductionDescriptor::IK_IntInduction: { 3045 assert(Index->getType() == StartValue->getType() && 3046 "Index type does not match StartValue type"); 3047 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3048 return B.CreateSub(StartValue, Index); 3049 auto *Offset = CreateMul( 3050 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3051 return CreateAdd(StartValue, Offset); 3052 } 3053 case InductionDescriptor::IK_PtrInduction: { 3054 assert(isa<SCEVConstant>(Step) && 3055 "Expected constant step for pointer induction"); 3056 return B.CreateGEP( 3057 StartValue->getType()->getPointerElementType(), StartValue, 3058 CreateMul(Index, 3059 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3060 } 3061 case InductionDescriptor::IK_FpInduction: { 3062 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3063 auto InductionBinOp = ID.getInductionBinOp(); 3064 assert(InductionBinOp && 3065 (InductionBinOp->getOpcode() == Instruction::FAdd || 3066 InductionBinOp->getOpcode() == Instruction::FSub) && 3067 "Original bin op should be defined for FP induction"); 3068 3069 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3070 3071 // Floating point operations had to be 'fast' to enable the induction. 3072 FastMathFlags Flags; 3073 Flags.setFast(); 3074 3075 Value *MulExp = B.CreateFMul(StepValue, Index); 3076 if (isa<Instruction>(MulExp)) 3077 // We have to check, the MulExp may be a constant. 
3078 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3079 3080 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3081 "induction"); 3082 if (isa<Instruction>(BOp)) 3083 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3084 3085 return BOp; 3086 } 3087 case InductionDescriptor::IK_NoInduction: 3088 return nullptr; 3089 } 3090 llvm_unreachable("invalid enum"); 3091 } 3092 3093 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3094 LoopScalarBody = OrigLoop->getHeader(); 3095 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3096 LoopExitBlock = OrigLoop->getExitBlock(); 3097 assert(LoopExitBlock && "Must have an exit block"); 3098 assert(LoopVectorPreHeader && "Invalid loop structure"); 3099 3100 LoopMiddleBlock = 3101 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3102 LI, nullptr, Twine(Prefix) + "middle.block"); 3103 LoopScalarPreHeader = 3104 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3105 nullptr, Twine(Prefix) + "scalar.ph"); 3106 // We intentionally don't let SplitBlock to update LoopInfo since 3107 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3108 // LoopVectorBody is explicitly added to the correct place few lines later. 3109 LoopVectorBody = 3110 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3111 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3112 3113 // Update dominator for loop exit. 3114 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3115 3116 // Create and register the new vector loop. 3117 Loop *Lp = LI->AllocateLoop(); 3118 Loop *ParentLoop = OrigLoop->getParentLoop(); 3119 3120 // Insert the new loop into the loop nest and register the new basic blocks 3121 // before calling any utilities such as SCEV that require valid LoopInfo. 3122 if (ParentLoop) { 3123 ParentLoop->addChildLoop(Lp); 3124 } else { 3125 LI->addTopLevelLoop(Lp); 3126 } 3127 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3128 return Lp; 3129 } 3130 3131 void InnerLoopVectorizer::createInductionResumeValues(Loop *L, 3132 Value *VectorTripCount) { 3133 assert(VectorTripCount && L && "Expected valid arguments"); 3134 // We are going to resume the execution of the scalar loop. 3135 // Go over all of the induction variables that we found and fix the 3136 // PHIs that are left in the scalar version of the loop. 3137 // The starting values of PHI nodes depend on the counter of the last 3138 // iteration in the vectorized loop. 3139 // If we come from a bypass edge then we need to start from the original 3140 // start value. 3141 for (auto &InductionEntry : Legal->getInductionVars()) { 3142 PHINode *OrigPhi = InductionEntry.first; 3143 InductionDescriptor II = InductionEntry.second; 3144 3145 // Create phi nodes to merge from the backedge-taken check block. 3146 PHINode *BCResumeVal = 3147 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3148 LoopScalarPreHeader->getTerminator()); 3149 // Copy original phi DL over to the new one. 3150 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3151 Value *&EndValue = IVEndValues[OrigPhi]; 3152 if (OrigPhi == OldInduction) { 3153 // We know what the end value is. 
3154 EndValue = VectorTripCount; 3155 } else { 3156 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3157 Type *StepType = II.getStep()->getType(); 3158 Instruction::CastOps CastOp = 3159 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3160 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3161 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3162 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3163 EndValue->setName("ind.end"); 3164 } 3165 3166 // The new PHI merges the original incoming value, in case of a bypass, 3167 // or the value at the end of the vectorized loop. 3168 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3169 3170 // Fix the scalar body counter (PHI node). 3171 // The old induction's phi node in the scalar body needs the truncated 3172 // value. 3173 for (BasicBlock *BB : LoopBypassBlocks) 3174 BCResumeVal->addIncoming(II.getStartValue(), BB); 3175 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3176 } 3177 } 3178 3179 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3180 MDNode *OrigLoopID) { 3181 assert(L && "Expected valid loop."); 3182 3183 // The trip counts should be cached by now. 3184 Value *Count = getOrCreateTripCount(L); 3185 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3186 3187 // We need the OrigLoop (scalar loop part) latch terminator to help 3188 // produce correct debug info for the middle block BB instructions. 3189 // The legality check stage guarantees that the loop will have a single 3190 // latch. 3191 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3192 "Scalar loop latch terminator isn't a branch"); 3193 BranchInst *ScalarLatchBr = 3194 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3195 3196 // Add a check in the middle block to see if we have completed 3197 // all of the iterations in the first vector loop. 3198 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3199 // If tail is to be folded, we know we don't need to run the remainder. 3200 Value *CmpN = Builder.getTrue(); 3201 if (!Cost->foldTailByMasking()) { 3202 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3203 VectorTripCount, "cmp.n", 3204 LoopMiddleBlock->getTerminator()); 3205 3206 // Here we use the same DebugLoc as the scalar loop latch branch instead 3207 // of the corresponding compare because they may have ended up with 3208 // different line numbers and we want to avoid awkward line stepping while 3209 // debugging. Eg. if the compare has got a line number inside the loop. 3210 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3211 } 3212 3213 BranchInst *BrInst = 3214 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3215 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3216 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3217 3218 // Get ready to start creating new instructions into the vectorized body. 3219 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3220 "Inconsistent vector loop preheader"); 3221 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3222 3223 Optional<MDNode *> VectorizedLoopID = 3224 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3225 LLVMLoopVectorizeFollowupVectorized}); 3226 if (VectorizedLoopID.hasValue()) { 3227 L->setLoopID(VectorizedLoopID.getValue()); 3228 3229 // Do not setAlreadyVectorized if loop attributes have been defined 3230 // explicitly. 
3231 return LoopVectorPreHeader; 3232 } 3233 3234 // Keep all loop hints from the original loop on the vector loop (we'll 3235 // replace the vectorizer-specific hints below). 3236 if (MDNode *LID = OrigLoop->getLoopID()) 3237 L->setLoopID(LID); 3238 3239 LoopVectorizeHints Hints(L, true, *ORE); 3240 Hints.setAlreadyVectorized(); 3241 3242 #ifdef EXPENSIVE_CHECKS 3243 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3244 LI->verify(*DT); 3245 #endif 3246 3247 return LoopVectorPreHeader; 3248 } 3249 3250 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3251 /* 3252 In this function we generate a new loop. The new loop will contain 3253 the vectorized instructions while the old loop will continue to run the 3254 scalar remainder. 3255 3256 [ ] <-- loop iteration number check. 3257 / | 3258 / v 3259 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3260 | / | 3261 | / v 3262 || [ ] <-- vector pre header. 3263 |/ | 3264 | v 3265 | [ ] \ 3266 | [ ]_| <-- vector loop. 3267 | | 3268 | v 3269 | -[ ] <--- middle-block. 3270 | / | 3271 | / v 3272 -|- >[ ] <--- new preheader. 3273 | | 3274 | v 3275 | [ ] \ 3276 | [ ]_| <-- old scalar loop to handle remainder. 3277 \ | 3278 \ v 3279 >[ ] <-- exit block. 3280 ... 3281 */ 3282 3283 // Get the metadata of the original loop before it gets modified. 3284 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3285 3286 // Create an empty vector loop, and prepare basic blocks for the runtime 3287 // checks. 3288 Loop *Lp = createVectorLoopSkeleton(""); 3289 3290 // Now, compare the new count to zero. If it is zero skip the vector loop and 3291 // jump to the scalar loop. This check also covers the case where the 3292 // backedge-taken count is uint##_max: adding one to it will overflow leading 3293 // to an incorrect trip count of zero. In this (rare) case we will also jump 3294 // to the scalar loop. 3295 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3296 3297 // Generate the code to check any assumptions that we've made for SCEV 3298 // expressions. 3299 emitSCEVChecks(Lp, LoopScalarPreHeader); 3300 3301 // Generate the code that checks in runtime if arrays overlap. We put the 3302 // checks into a separate block to make the more common case of few elements 3303 // faster. 3304 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3305 3306 // Some loops have a single integer induction variable, while other loops 3307 // don't. One example is c++ iterators that often have multiple pointer 3308 // induction variables. In the code below we also support a case where we 3309 // don't have a single induction variable. 3310 // 3311 // We try to obtain an induction variable from the original loop as hard 3312 // as possible. However if we don't find one that: 3313 // - is an integer 3314 // - counts from zero, stepping by one 3315 // - is the size of the widest induction variable type 3316 // then we create a new one. 3317 OldInduction = Legal->getPrimaryInduction(); 3318 Type *IdxTy = Legal->getWidestInductionType(); 3319 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3320 // The loop step is equal to the vectorization factor (num of SIMD elements) 3321 // times the unroll factor (num of SIMD instructions). 
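// For example, VF = 4 combined with UF = 2 yields a step of 8, so the new
// 'index' advances by eight scalar iterations per trip through the vector loop
// and is compared against the vector trip count computed below.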
3322 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3323 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 3324 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3325 Induction = 3326 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3327 getDebugLocFromInstOrOperands(OldInduction)); 3328 3329 // Emit phis for the new starting index of the scalar loop. 3330 createInductionResumeValues(Lp, CountRoundDown); 3331 3332 return completeLoopSkeleton(Lp, OrigLoopID); 3333 } 3334 3335 // Fix up external users of the induction variable. At this point, we are 3336 // in LCSSA form, with all external PHIs that use the IV having one input value, 3337 // coming from the remainder loop. We need those PHIs to also have a correct 3338 // value for the IV when arriving directly from the middle block. 3339 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3340 const InductionDescriptor &II, 3341 Value *CountRoundDown, Value *EndValue, 3342 BasicBlock *MiddleBlock) { 3343 // There are two kinds of external IV usages - those that use the value 3344 // computed in the last iteration (the PHI) and those that use the penultimate 3345 // value (the value that feeds into the phi from the loop latch). 3346 // We allow both, but they, obviously, have different values. 3347 3348 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3349 3350 DenseMap<Value *, Value *> MissingVals; 3351 3352 // An external user of the last iteration's value should see the value that 3353 // the remainder loop uses to initialize its own IV. 3354 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3355 for (User *U : PostInc->users()) { 3356 Instruction *UI = cast<Instruction>(U); 3357 if (!OrigLoop->contains(UI)) { 3358 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3359 MissingVals[UI] = EndValue; 3360 } 3361 } 3362 3363 // An external user of the penultimate value need to see EndValue - Step. 3364 // The simplest way to get this is to recompute it from the constituent SCEVs, 3365 // that is Start + (Step * (CRD - 1)). 3366 for (User *U : OrigPhi->users()) { 3367 auto *UI = cast<Instruction>(U); 3368 if (!OrigLoop->contains(UI)) { 3369 const DataLayout &DL = 3370 OrigLoop->getHeader()->getModule()->getDataLayout(); 3371 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3372 3373 IRBuilder<> B(MiddleBlock->getTerminator()); 3374 Value *CountMinusOne = B.CreateSub( 3375 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3376 Value *CMO = 3377 !II.getStep()->getType()->isIntegerTy() 3378 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3379 II.getStep()->getType()) 3380 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3381 CMO->setName("cast.cmo"); 3382 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3383 Escape->setName("ind.escape"); 3384 MissingVals[UI] = Escape; 3385 } 3386 } 3387 3388 for (auto &I : MissingVals) { 3389 PHINode *PHI = cast<PHINode>(I.first); 3390 // One corner case we have to handle is two IVs "chasing" each-other, 3391 // that is %IV2 = phi [...], [ %IV1, %latch ] 3392 // In this case, if IV1 has an external use, we need to avoid adding both 3393 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3394 // don't already have an incoming value for the middle block. 
3395 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3396 PHI->addIncoming(I.second, MiddleBlock); 3397 } 3398 } 3399 3400 namespace { 3401 3402 struct CSEDenseMapInfo { 3403 static bool canHandle(const Instruction *I) { 3404 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3405 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3406 } 3407 3408 static inline Instruction *getEmptyKey() { 3409 return DenseMapInfo<Instruction *>::getEmptyKey(); 3410 } 3411 3412 static inline Instruction *getTombstoneKey() { 3413 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3414 } 3415 3416 static unsigned getHashValue(const Instruction *I) { 3417 assert(canHandle(I) && "Unknown instruction!"); 3418 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3419 I->value_op_end())); 3420 } 3421 3422 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3423 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3424 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3425 return LHS == RHS; 3426 return LHS->isIdenticalTo(RHS); 3427 } 3428 }; 3429 3430 } // end anonymous namespace 3431 3432 ///Perform cse of induction variable instructions. 3433 static void cse(BasicBlock *BB) { 3434 // Perform simple cse. 3435 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3436 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3437 Instruction *In = &*I++; 3438 3439 if (!CSEDenseMapInfo::canHandle(In)) 3440 continue; 3441 3442 // Check if we can replace this instruction with any of the 3443 // visited instructions. 3444 if (Instruction *V = CSEMap.lookup(In)) { 3445 In->replaceAllUsesWith(V); 3446 In->eraseFromParent(); 3447 continue; 3448 } 3449 3450 CSEMap[In] = In; 3451 } 3452 } 3453 3454 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3455 ElementCount VF, 3456 bool &NeedToScalarize) { 3457 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3458 Function *F = CI->getCalledFunction(); 3459 Type *ScalarRetTy = CI->getType(); 3460 SmallVector<Type *, 4> Tys, ScalarTys; 3461 for (auto &ArgOp : CI->arg_operands()) 3462 ScalarTys.push_back(ArgOp->getType()); 3463 3464 // Estimate cost of scalarized vector call. The source operands are assumed 3465 // to be vectors, so we need to extract individual elements from there, 3466 // execute VF scalar calls, and then gather the result into the vector return 3467 // value. 3468 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3469 TTI::TCK_RecipThroughput); 3470 if (VF.isScalar()) 3471 return ScalarCallCost; 3472 3473 // Compute corresponding vector type for return value and arguments. 3474 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3475 for (Type *ScalarTy : ScalarTys) 3476 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3477 3478 // Compute costs of unpacking argument values for the scalar calls and 3479 // packing the return values to a vector. 3480 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3481 3482 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3483 3484 // If we can't emit a vector call for this function, then the currently found 3485 // cost is the cost we need to return. 
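// As a rough illustration of the formula above (the numbers are made up): at
// VF = 4, a call with scalar cost 2 and scalarization overhead 6 is costed at
// 2 * 4 + 6 = 14; if the VFDatabase query below finds a vector variant that is
// cheaper than this, that cost wins and NeedToScalarize is cleared.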
3486 NeedToScalarize = true; 3487 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3488 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3489 3490 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3491 return Cost; 3492 3493 // If the corresponding vector cost is cheaper, return its cost. 3494 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3495 TTI::TCK_RecipThroughput); 3496 if (VectorCallCost < Cost) { 3497 NeedToScalarize = false; 3498 return VectorCallCost; 3499 } 3500 return Cost; 3501 } 3502 3503 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3504 ElementCount VF) { 3505 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3506 assert(ID && "Expected intrinsic call!"); 3507 3508 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3509 return TTI.getIntrinsicInstrCost(CostAttrs, 3510 TargetTransformInfo::TCK_RecipThroughput); 3511 } 3512 3513 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3514 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3515 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3516 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3517 } 3518 3519 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3520 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3521 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3522 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3523 } 3524 3525 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3526 // For every instruction `I` in MinBWs, truncate the operands, create a 3527 // truncated version of `I` and reextend its result. InstCombine runs 3528 // later and will remove any ext/trunc pairs. 3529 SmallPtrSet<Value *, 4> Erased; 3530 for (const auto &KV : Cost->getMinimalBitwidths()) { 3531 // If the value wasn't vectorized, we must maintain the original scalar 3532 // type. The absence of the value from VectorLoopValueMap indicates that it 3533 // wasn't vectorized. 3534 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3535 continue; 3536 for (unsigned Part = 0; Part < UF; ++Part) { 3537 Value *I = getOrCreateVectorValue(KV.first, Part); 3538 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3539 continue; 3540 Type *OriginalTy = I->getType(); 3541 Type *ScalarTruncatedTy = 3542 IntegerType::get(OriginalTy->getContext(), KV.second); 3543 auto *TruncatedTy = FixedVectorType::get( 3544 ScalarTruncatedTy, 3545 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3546 if (TruncatedTy == OriginalTy) 3547 continue; 3548 3549 IRBuilder<> B(cast<Instruction>(I)); 3550 auto ShrinkOperand = [&](Value *V) -> Value * { 3551 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3552 if (ZI->getSrcTy() == TruncatedTy) 3553 return ZI->getOperand(0); 3554 return B.CreateZExtOrTrunc(V, TruncatedTy); 3555 }; 3556 3557 // The actual instruction modification depends on the instruction type, 3558 // unfortunately. 3559 Value *NewI = nullptr; 3560 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3561 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3562 ShrinkOperand(BO->getOperand(1))); 3563 3564 // Any wrapping introduced by shrinking this operation shouldn't be 3565 // considered undefined behavior. So, we can't unconditionally copy 3566 // arithmetic wrapping flags to NewI. 
3567 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3568 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3569 NewI = 3570 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3571 ShrinkOperand(CI->getOperand(1))); 3572 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3573 NewI = B.CreateSelect(SI->getCondition(), 3574 ShrinkOperand(SI->getTrueValue()), 3575 ShrinkOperand(SI->getFalseValue())); 3576 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3577 switch (CI->getOpcode()) { 3578 default: 3579 llvm_unreachable("Unhandled cast!"); 3580 case Instruction::Trunc: 3581 NewI = ShrinkOperand(CI->getOperand(0)); 3582 break; 3583 case Instruction::SExt: 3584 NewI = B.CreateSExtOrTrunc( 3585 CI->getOperand(0), 3586 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3587 break; 3588 case Instruction::ZExt: 3589 NewI = B.CreateZExtOrTrunc( 3590 CI->getOperand(0), 3591 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3592 break; 3593 } 3594 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3595 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3596 ->getNumElements(); 3597 auto *O0 = B.CreateZExtOrTrunc( 3598 SI->getOperand(0), 3599 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3600 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3601 ->getNumElements(); 3602 auto *O1 = B.CreateZExtOrTrunc( 3603 SI->getOperand(1), 3604 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3605 3606 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3607 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3608 // Don't do anything with the operands, just extend the result. 3609 continue; 3610 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3611 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3612 ->getNumElements(); 3613 auto *O0 = B.CreateZExtOrTrunc( 3614 IE->getOperand(0), 3615 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3616 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3617 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3618 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3619 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3620 ->getNumElements(); 3621 auto *O0 = B.CreateZExtOrTrunc( 3622 EE->getOperand(0), 3623 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3624 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3625 } else { 3626 // If we don't know what to do, be conservative and don't do anything. 3627 continue; 3628 } 3629 3630 // Lastly, extend the result. 3631 NewI->takeName(cast<Instruction>(I)); 3632 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3633 I->replaceAllUsesWith(Res); 3634 cast<Instruction>(I)->eraseFromParent(); 3635 Erased.insert(I); 3636 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3637 } 3638 } 3639 3640 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3641 for (const auto &KV : Cost->getMinimalBitwidths()) { 3642 // If the value wasn't vectorized, we must maintain the original scalar 3643 // type. The absence of the value from VectorLoopValueMap indicates that it 3644 // wasn't vectorized. 
3645 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3646 continue; 3647 for (unsigned Part = 0; Part < UF; ++Part) { 3648 Value *I = getOrCreateVectorValue(KV.first, Part); 3649 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3650 if (Inst && Inst->use_empty()) { 3651 Value *NewI = Inst->getOperand(0); 3652 Inst->eraseFromParent(); 3653 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3654 } 3655 } 3656 } 3657 } 3658 3659 void InnerLoopVectorizer::fixVectorizedLoop() { 3660 // Insert truncates and extends for any truncated instructions as hints to 3661 // InstCombine. 3662 if (VF.isVector()) 3663 truncateToMinimalBitwidths(); 3664 3665 // Fix widened non-induction PHIs by setting up the PHI operands. 3666 if (OrigPHIsToFix.size()) { 3667 assert(EnableVPlanNativePath && 3668 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3669 fixNonInductionPHIs(); 3670 } 3671 3672 // At this point every instruction in the original loop is widened to a 3673 // vector form. Now we need to fix the recurrences in the loop. These PHI 3674 // nodes are currently empty because we did not want to introduce cycles. 3675 // This is the second stage of vectorizing recurrences. 3676 fixCrossIterationPHIs(); 3677 3678 // Forget the original basic block. 3679 PSE.getSE()->forgetLoop(OrigLoop); 3680 3681 // Fix-up external users of the induction variables. 3682 for (auto &Entry : Legal->getInductionVars()) 3683 fixupIVUsers(Entry.first, Entry.second, 3684 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3685 IVEndValues[Entry.first], LoopMiddleBlock); 3686 3687 fixLCSSAPHIs(); 3688 for (Instruction *PI : PredicatedInstructions) 3689 sinkScalarOperands(&*PI); 3690 3691 // Remove redundant induction instructions. 3692 cse(LoopVectorBody); 3693 3694 // Set/update profile weights for the vector and remainder loops as original 3695 // loop iterations are now distributed among them. Note that original loop 3696 // represented by LoopScalarBody becomes remainder loop after vectorization. 3697 // 3698 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3699 // end up getting slightly roughened result but that should be OK since 3700 // profile is not inherently precise anyway. Note also possible bypass of 3701 // vector code caused by legality checks is ignored, assigning all the weight 3702 // to the vector loop, optimistically. 3703 assert(!VF.isScalable() && 3704 "cannot use scalable ElementCount to determine unroll factor"); 3705 setProfileInfoAfterUnrolling( 3706 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 3707 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 3708 } 3709 3710 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3711 // In order to support recurrences we need to be able to vectorize Phi nodes. 3712 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3713 // stage #2: We now need to fix the recurrences by adding incoming edges to 3714 // the currently empty PHI nodes. At this point every instruction in the 3715 // original loop is widened to a vector form so we can use them to construct 3716 // the incoming edges. 3717 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3718 // Handle first-order recurrences and reductions that need to be fixed. 
3719 if (Legal->isFirstOrderRecurrence(&Phi)) 3720 fixFirstOrderRecurrence(&Phi); 3721 else if (Legal->isReductionVariable(&Phi)) 3722 fixReduction(&Phi); 3723 } 3724 } 3725 3726 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3727 // This is the second phase of vectorizing first-order recurrences. An 3728 // overview of the transformation is described below. Suppose we have the 3729 // following loop. 3730 // 3731 // for (int i = 0; i < n; ++i) 3732 // b[i] = a[i] - a[i - 1]; 3733 // 3734 // There is a first-order recurrence on "a". For this loop, the shorthand 3735 // scalar IR looks like: 3736 // 3737 // scalar.ph: 3738 // s_init = a[-1] 3739 // br scalar.body 3740 // 3741 // scalar.body: 3742 // i = phi [0, scalar.ph], [i+1, scalar.body] 3743 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3744 // s2 = a[i] 3745 // b[i] = s2 - s1 3746 // br cond, scalar.body, ... 3747 // 3748 // In this example, s1 is a recurrence because it's value depends on the 3749 // previous iteration. In the first phase of vectorization, we created a 3750 // temporary value for s1. We now complete the vectorization and produce the 3751 // shorthand vector IR shown below (for VF = 4, UF = 1). 3752 // 3753 // vector.ph: 3754 // v_init = vector(..., ..., ..., a[-1]) 3755 // br vector.body 3756 // 3757 // vector.body 3758 // i = phi [0, vector.ph], [i+4, vector.body] 3759 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3760 // v2 = a[i, i+1, i+2, i+3]; 3761 // v3 = vector(v1(3), v2(0, 1, 2)) 3762 // b[i, i+1, i+2, i+3] = v2 - v3 3763 // br cond, vector.body, middle.block 3764 // 3765 // middle.block: 3766 // x = v2(3) 3767 // br scalar.ph 3768 // 3769 // scalar.ph: 3770 // s_init = phi [x, middle.block], [a[-1], otherwise] 3771 // br scalar.body 3772 // 3773 // After execution completes the vector loop, we extract the next value of 3774 // the recurrence (x) to use as the initial value in the scalar loop. 3775 3776 // Get the original loop preheader and single loop latch. 3777 auto *Preheader = OrigLoop->getLoopPreheader(); 3778 auto *Latch = OrigLoop->getLoopLatch(); 3779 3780 // Get the initial and previous values of the scalar recurrence. 3781 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3782 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3783 3784 // Create a vector from the initial value. 3785 auto *VectorInit = ScalarInit; 3786 if (VF.isVector()) { 3787 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3788 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 3789 VectorInit = Builder.CreateInsertElement( 3790 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3791 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 3792 } 3793 3794 // We constructed a temporary phi node in the first phase of vectorization. 3795 // This phi node will eventually be deleted. 3796 Builder.SetInsertPoint( 3797 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3798 3799 // Create a phi node for the new recurrence. The current value will either be 3800 // the initial value inserted into a vector or loop-varying vector value. 3801 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3802 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3803 3804 // Get the vectorized previous value of the last part UF - 1. It appears last 3805 // among all unrolled iterations, due to the order of their construction. 
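// For example, with UF = 2 this is the part-1 vector of 'Previous'. With
// VF = 4 the shuffles created below use the mask <3, 4, 5, 6>, i.e. the last
// element of the first operand followed by the first three elements of the
// second, matching the v3 = vector(v1(3), v2(0, 1, 2)) step sketched above.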
3806 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3807 3808 // Find and set the insertion point after the previous value if it is an 3809 // instruction. 3810 BasicBlock::iterator InsertPt; 3811 // Note that the previous value may have been constant-folded so it is not 3812 // guaranteed to be an instruction in the vector loop. 3813 // FIXME: Loop invariant values do not form recurrences. We should deal with 3814 // them earlier. 3815 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3816 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3817 else { 3818 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3819 if (isa<PHINode>(PreviousLastPart)) 3820 // If the previous value is a phi node, we should insert after all the phi 3821 // nodes in the block containing the PHI to avoid breaking basic block 3822 // verification. Note that the basic block may be different to 3823 // LoopVectorBody, in case we predicate the loop. 3824 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3825 else 3826 InsertPt = ++PreviousInst->getIterator(); 3827 } 3828 Builder.SetInsertPoint(&*InsertPt); 3829 3830 // We will construct a vector for the recurrence by combining the values for 3831 // the current and previous iterations. This is the required shuffle mask. 3832 assert(!VF.isScalable()); 3833 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 3834 ShuffleMask[0] = VF.getKnownMinValue() - 1; 3835 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 3836 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 3837 3838 // The vector from which to take the initial value for the current iteration 3839 // (actual or unrolled). Initially, this is the vector phi node. 3840 Value *Incoming = VecPhi; 3841 3842 // Shuffle the current and previous vector and update the vector parts. 3843 for (unsigned Part = 0; Part < UF; ++Part) { 3844 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3845 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3846 auto *Shuffle = 3847 VF.isVector() 3848 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 3849 : Incoming; 3850 PhiPart->replaceAllUsesWith(Shuffle); 3851 cast<Instruction>(PhiPart)->eraseFromParent(); 3852 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3853 Incoming = PreviousPart; 3854 } 3855 3856 // Fix the latch value of the new recurrence in the vector loop. 3857 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3858 3859 // Extract the last vector element in the middle block. This will be the 3860 // initial value for the recurrence when jumping to the scalar loop. 3861 auto *ExtractForScalar = Incoming; 3862 if (VF.isVector()) { 3863 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3864 ExtractForScalar = Builder.CreateExtractElement( 3865 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 3866 "vector.recur.extract"); 3867 } 3868 // Extract the second last element in the middle block if the 3869 // Phi is used outside the loop. We need to extract the phi itself 3870 // and not the last element (the phi update in the current iteration). This 3871 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3872 // when the scalar loop is not run at all. 
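// With VF = 4, for instance, this extracts lane 2 (the second last) of
// 'Incoming', which at this point holds the vectorized 'Previous' value of the
// final unrolled part.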
3873 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3874 if (VF.isVector())
3875 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3876 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
3877 "vector.recur.extract.for.phi");
3878 // When the loop is unrolled without vectorizing, initialize
3879 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value
3880 // of `Incoming`. This is analogous to the vectorized case above: extracting the
3881 // second last element when VF > 1.
3882 else if (UF > 1)
3883 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3884
3885 // Fix the initial value of the original recurrence in the scalar loop.
3886 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3887 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3888 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3889 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3890 Start->addIncoming(Incoming, BB);
3891 }
3892
3893 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3894 Phi->setName("scalar.recur");
3895
3896 // Finally, fix users of the recurrence outside the loop. The users will need
3897 // either the last value of the scalar recurrence or the last value of the
3898 // vector recurrence we extracted in the middle block. Since the loop is in
3899 // LCSSA form, we just need to find all the phi nodes for the original scalar
3900 // recurrence in the exit block, and then add an edge for the middle block.
3901 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3902 if (LCSSAPhi.getIncomingValue(0) == Phi) {
3903 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3904 }
3905 }
3906 }
3907
3908 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3909 Constant *Zero = Builder.getInt32(0);
3910
3911 // Get its reduction variable descriptor.
3912 assert(Legal->isReductionVariable(Phi) &&
3913 "Unable to find the reduction variable");
3914 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3915
3916 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3917 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3918 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3919 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3920 RdxDesc.getMinMaxRecurrenceKind();
3921 setDebugLocFromInst(Builder, ReductionStartValue);
3922 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3923
3924 // We need to generate a reduction vector from the incoming scalar.
3925 // To do so, we need to generate the 'identity' vector and override
3926 // one of the elements with the incoming scalar reduction. We need
3927 // to do it in the vector-loop preheader.
3928 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3929
3930 // This is the vector-clone of the value that leaves the loop.
3931 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3932
3933 // Find the reduction identity variable. Zero for addition, or, xor,
3934 // one for multiplication, -1 for And.
3935 Value *Identity;
3936 Value *VectorStart;
3937 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3938 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3939 // MinMax reductions have the start value as their identity.
3940 if (VF == 1 || IsInLoopReductionPhi) { 3941 VectorStart = Identity = ReductionStartValue; 3942 } else { 3943 VectorStart = Identity = 3944 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3945 } 3946 } else { 3947 // Handle other reduction kinds: 3948 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3949 RK, VecTy->getScalarType()); 3950 if (VF == 1 || IsInLoopReductionPhi) { 3951 Identity = Iden; 3952 // This vector is the Identity vector where the first element is the 3953 // incoming scalar reduction. 3954 VectorStart = ReductionStartValue; 3955 } else { 3956 Identity = ConstantVector::getSplat(VF, Iden); 3957 3958 // This vector is the Identity vector where the first element is the 3959 // incoming scalar reduction. 3960 VectorStart = 3961 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3962 } 3963 } 3964 3965 // Wrap flags are in general invalid after vectorization, clear them. 3966 clearReductionWrapFlags(RdxDesc); 3967 3968 // Fix the vector-loop phi. 3969 3970 // Reductions do not have to start at zero. They can start with 3971 // any loop invariant values. 3972 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3973 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3974 3975 for (unsigned Part = 0; Part < UF; ++Part) { 3976 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3977 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3978 // Make sure to add the reduction start value only to the 3979 // first unroll part. 3980 Value *StartVal = (Part == 0) ? VectorStart : Identity; 3981 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3982 cast<PHINode>(VecRdxPhi) 3983 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3984 } 3985 3986 // Before each round, move the insertion point right between 3987 // the PHIs and the values we are going to write. 3988 // This allows us to write both PHINodes and the extractelement 3989 // instructions. 3990 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3991 3992 setDebugLocFromInst(Builder, LoopExitInst); 3993 3994 // If tail is folded by masking, the vector value to leave the loop should be 3995 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3996 // instead of the former. 3997 if (Cost->foldTailByMasking()) { 3998 for (unsigned Part = 0; Part < UF; ++Part) { 3999 Value *VecLoopExitInst = 4000 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4001 Value *Sel = nullptr; 4002 for (User *U : VecLoopExitInst->users()) { 4003 if (isa<SelectInst>(U)) { 4004 assert(!Sel && "Reduction exit feeding two selects"); 4005 Sel = U; 4006 } else 4007 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4008 } 4009 assert(Sel && "Reduction exit feeds no select"); 4010 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4011 4012 // If the target can create a predicated operator for the reduction at no 4013 // extra cost in the loop (for example a predicated vadd), it can be 4014 // cheaper for the select to remain in the loop than be sunk out of it, 4015 // and so use the select value for the phi instead of the old 4016 // LoopExitValue. 
4017 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4018 if (PreferPredicatedReductionSelect || 4019 TTI->preferPredicatedReductionSelect( 4020 RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()), 4021 Phi->getType(), TargetTransformInfo::ReductionFlags())) { 4022 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4023 VecRdxPhi->setIncomingValueForBlock( 4024 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4025 } 4026 } 4027 } 4028 4029 // If the vector reduction can be performed in a smaller type, we truncate 4030 // then extend the loop exit value to enable InstCombine to evaluate the 4031 // entire expression in the smaller type. 4032 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4033 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4034 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4035 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4036 Builder.SetInsertPoint( 4037 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4038 VectorParts RdxParts(UF); 4039 for (unsigned Part = 0; Part < UF; ++Part) { 4040 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4041 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4042 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4043 : Builder.CreateZExt(Trunc, VecTy); 4044 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4045 UI != RdxParts[Part]->user_end();) 4046 if (*UI != Trunc) { 4047 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4048 RdxParts[Part] = Extnd; 4049 } else { 4050 ++UI; 4051 } 4052 } 4053 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4054 for (unsigned Part = 0; Part < UF; ++Part) { 4055 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4056 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4057 } 4058 } 4059 4060 // Reduce all of the unrolled parts into a single vector. 4061 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4062 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 4063 4064 // The middle block terminator has already been assigned a DebugLoc here (the 4065 // OrigLoop's single latch terminator). We want the whole middle block to 4066 // appear to execute on this line because: (a) it is all compiler generated, 4067 // (b) these instructions are always executed after evaluating the latch 4068 // conditional branch, and (c) other passes may add new predecessors which 4069 // terminate on this line. This is the easiest way to ensure we don't 4070 // accidentally cause an extra step back into the loop while debugging. 4071 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4072 for (unsigned Part = 1; Part < UF; ++Part) { 4073 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4074 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4075 // Floating point operations had to be 'fast' to enable the reduction. 4076 ReducedPartRdx = addFastMathFlag( 4077 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4078 ReducedPartRdx, "bin.rdx"), 4079 RdxDesc.getFastMathFlags()); 4080 else 4081 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 4082 RdxPart); 4083 } 4084 4085 // Create the reduction after the loop. Note that inloop reductions create the 4086 // target reduction in the loop using a Reduction recipe. 
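  // For illustration: for a plain add reduction with VF = 4 this final step
  // emits a single horizontal reduction of the combined part in the middle
  // block, conceptually
  //   %rdx = reduce.add(<4 x i32> %bin.rdx)
  // with the exact intrinsic or shuffle sequence left to
  // createTargetReduction, followed (if needed) by a sign/zero extension back
  // to the original phi type.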
4087 if (VF.isVector() && !IsInLoopReductionPhi) { 4088 bool NoNaN = Legal->hasFunNoNaNAttr(); 4089 ReducedPartRdx = 4090 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 4091 // If the reduction can be performed in a smaller type, we need to extend 4092 // the reduction to the wider type before we branch to the original loop. 4093 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4094 ReducedPartRdx = 4095 RdxDesc.isSigned() 4096 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4097 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4098 } 4099 4100 // Create a phi node that merges control-flow from the backedge-taken check 4101 // block and the middle block. 4102 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4103 LoopScalarPreHeader->getTerminator()); 4104 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4105 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4106 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4107 4108 // Now, we need to fix the users of the reduction variable 4109 // inside and outside of the scalar remainder loop. 4110 // We know that the loop is in LCSSA form. We need to update the 4111 // PHI nodes in the exit blocks. 4112 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4113 // All PHINodes need to have a single entry edge, or two if 4114 // we already fixed them. 4115 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4116 4117 // We found a reduction value exit-PHI. Update it with the 4118 // incoming bypass edge. 4119 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4120 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4121 } // end of the LCSSA phi scan. 4122 4123 // Fix the scalar loop reduction variable with the incoming reduction sum 4124 // from the vector body and from the backedge value. 4125 int IncomingEdgeBlockIdx = 4126 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4127 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4128 // Pick the other block. 4129 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4130 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4131 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4132 } 4133 4134 void InnerLoopVectorizer::clearReductionWrapFlags( 4135 RecurrenceDescriptor &RdxDesc) { 4136 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 4137 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 4138 RK != RecurrenceDescriptor::RK_IntegerMult) 4139 return; 4140 4141 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4142 assert(LoopExitInstr && "null loop exit instruction"); 4143 SmallVector<Instruction *, 8> Worklist; 4144 SmallPtrSet<Instruction *, 8> Visited; 4145 Worklist.push_back(LoopExitInstr); 4146 Visited.insert(LoopExitInstr); 4147 4148 while (!Worklist.empty()) { 4149 Instruction *Cur = Worklist.pop_back_val(); 4150 if (isa<OverflowingBinaryOperator>(Cur)) 4151 for (unsigned Part = 0; Part < UF; ++Part) { 4152 Value *V = getOrCreateVectorValue(Cur, Part); 4153 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4154 } 4155 4156 for (User *U : Cur->users()) { 4157 Instruction *UI = cast<Instruction>(U); 4158 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4159 Visited.insert(UI).second) 4160 Worklist.push_back(UI); 4161 } 4162 } 4163 } 4164 4165 void InnerLoopVectorizer::fixLCSSAPHIs() { 4166 assert(!VF.isScalable() && "the code below assumes fixed width vectors"); 4167 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4168 if (LCSSAPhi.getNumIncomingValues() == 1) { 4169 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4170 // Non-instruction incoming values will have only one value. 4171 unsigned LastLane = 0; 4172 if (isa<Instruction>(IncomingValue)) 4173 LastLane = Cost->isUniformAfterVectorization( 4174 cast<Instruction>(IncomingValue), VF) 4175 ? 0 4176 : VF.getKnownMinValue() - 1; 4177 // Can be a loop invariant incoming value or the last scalar value to be 4178 // extracted from the vectorized loop. 4179 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4180 Value *lastIncomingValue = 4181 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4182 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4183 } 4184 } 4185 } 4186 4187 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4188 // The basic block and loop containing the predicated instruction. 4189 auto *PredBB = PredInst->getParent(); 4190 auto *VectorLoop = LI->getLoopFor(PredBB); 4191 4192 // Initialize a worklist with the operands of the predicated instruction. 4193 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4194 4195 // Holds instructions that we need to analyze again. An instruction may be 4196 // reanalyzed if we don't yet know if we can sink it or not. 4197 SmallVector<Instruction *, 8> InstsToReanalyze; 4198 4199 // Returns true if a given use occurs in the predicated block. Phi nodes use 4200 // their operands in their corresponding predecessor blocks. 4201 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4202 auto *I = cast<Instruction>(U.getUser()); 4203 BasicBlock *BB = I->getParent(); 4204 if (auto *Phi = dyn_cast<PHINode>(I)) 4205 BB = Phi->getIncomingBlock( 4206 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4207 return BB == PredBB; 4208 }; 4209 4210 // Iteratively sink the scalarized operands of the predicated instruction 4211 // into the block we created for it. When an instruction is sunk, it's 4212 // operands are then added to the worklist. 
The algorithm ends after one pass 4213 // through the worklist doesn't sink a single instruction. 4214 bool Changed; 4215 do { 4216 // Add the instructions that need to be reanalyzed to the worklist, and 4217 // reset the changed indicator. 4218 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4219 InstsToReanalyze.clear(); 4220 Changed = false; 4221 4222 while (!Worklist.empty()) { 4223 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4224 4225 // We can't sink an instruction if it is a phi node, is already in the 4226 // predicated block, is not in the loop, or may have side effects. 4227 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4228 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4229 continue; 4230 4231 // It's legal to sink the instruction if all its uses occur in the 4232 // predicated block. Otherwise, there's nothing to do yet, and we may 4233 // need to reanalyze the instruction. 4234 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4235 InstsToReanalyze.push_back(I); 4236 continue; 4237 } 4238 4239 // Move the instruction to the beginning of the predicated block, and add 4240 // it's operands to the worklist. 4241 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4242 Worklist.insert(I->op_begin(), I->op_end()); 4243 4244 // The sinking may have enabled other instructions to be sunk, so we will 4245 // need to iterate. 4246 Changed = true; 4247 } 4248 } while (Changed); 4249 } 4250 4251 void InnerLoopVectorizer::fixNonInductionPHIs() { 4252 for (PHINode *OrigPhi : OrigPHIsToFix) { 4253 PHINode *NewPhi = 4254 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4255 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4256 4257 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4258 predecessors(OrigPhi->getParent())); 4259 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4260 predecessors(NewPhi->getParent())); 4261 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4262 "Scalar and Vector BB should have the same number of predecessors"); 4263 4264 // The insertion point in Builder may be invalidated by the time we get 4265 // here. Force the Builder insertion point to something valid so that we do 4266 // not run into issues during insertion point restore in 4267 // getOrCreateVectorValue calls below. 4268 Builder.SetInsertPoint(NewPhi); 4269 4270 // The predecessor order is preserved and we can rely on mapping between 4271 // scalar and vector block predecessors. 4272 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4273 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4274 4275 // When looking up the new scalar/vector values to fix up, use incoming 4276 // values from original phi. 4277 Value *ScIncV = 4278 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4279 4280 // Scalar incoming value may need a broadcast 4281 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4282 NewPhi->addIncoming(NewIncV, NewPredBB); 4283 } 4284 } 4285 } 4286 4287 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, 4288 unsigned UF, ElementCount VF, 4289 bool IsPtrLoopInvariant, 4290 SmallBitVector &IsIndexLoopInvariant, 4291 VPTransformState &State) { 4292 // Construct a vector GEP by widening the operands of the scalar GEP as 4293 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4294 // results in a vector of pointers when at least one operand of the GEP 4295 // is vector-typed. 
Thus, to keep the representation compact, we only use 4296 // vector-typed operands for loop-varying values. 4297 4298 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4299 // If we are vectorizing, but the GEP has only loop-invariant operands, 4300 // the GEP we build (by only using vector-typed operands for 4301 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4302 // produce a vector of pointers, we need to either arbitrarily pick an 4303 // operand to broadcast, or broadcast a clone of the original GEP. 4304 // Here, we broadcast a clone of the original. 4305 // 4306 // TODO: If at some point we decide to scalarize instructions having 4307 // loop-invariant operands, this special case will no longer be 4308 // required. We would add the scalarization decision to 4309 // collectLoopScalars() and teach getVectorValue() to broadcast 4310 // the lane-zero scalar value. 4311 auto *Clone = Builder.Insert(GEP->clone()); 4312 for (unsigned Part = 0; Part < UF; ++Part) { 4313 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4314 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4315 addMetadata(EntryPart, GEP); 4316 } 4317 } else { 4318 // If the GEP has at least one loop-varying operand, we are sure to 4319 // produce a vector of pointers. But if we are only unrolling, we want 4320 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4321 // produce with the code below will be scalar (if VF == 1) or vector 4322 // (otherwise). Note that for the unroll-only case, we still maintain 4323 // values in the vector mapping with initVector, as we do for other 4324 // instructions. 4325 for (unsigned Part = 0; Part < UF; ++Part) { 4326 // The pointer operand of the new GEP. If it's loop-invariant, we 4327 // won't broadcast it. 4328 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4329 : State.get(Operands.getOperand(0), Part); 4330 4331 // Collect all the indices for the new GEP. If any index is 4332 // loop-invariant, we won't broadcast it. 4333 SmallVector<Value *, 4> Indices; 4334 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4335 VPValue *Operand = Operands.getOperand(I); 4336 if (IsIndexLoopInvariant[I - 1]) 4337 Indices.push_back(State.get(Operand, {0, 0})); 4338 else 4339 Indices.push_back(State.get(Operand, Part)); 4340 } 4341 4342 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4343 // but it should be a vector, otherwise. 4344 auto *NewGEP = 4345 GEP->isInBounds() 4346 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4347 Indices) 4348 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4349 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4350 "NewGEP is not a pointer vector"); 4351 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4352 addMetadata(NewGEP, GEP); 4353 } 4354 } 4355 } 4356 4357 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4358 ElementCount VF) { 4359 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4360 PHINode *P = cast<PHINode>(PN); 4361 if (EnableVPlanNativePath) { 4362 // Currently we enter here in the VPlan-native path for non-induction 4363 // PHIs where all control flow is uniform. We simply widen these PHIs. 4364 // Create a vector phi with no operands - the vector phi operands will be 4365 // set at the end of vector code generation. 4366 Type *VecTy = 4367 (VF.isScalar()) ? 
PN->getType() : VectorType::get(PN->getType(), VF); 4368 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4369 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4370 OrigPHIsToFix.push_back(P); 4371 4372 return; 4373 } 4374 4375 assert(PN->getParent() == OrigLoop->getHeader() && 4376 "Non-header phis should have been handled elsewhere"); 4377 4378 // In order to support recurrences we need to be able to vectorize Phi nodes. 4379 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4380 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4381 // this value when we vectorize all of the instructions that use the PHI. 4382 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4383 for (unsigned Part = 0; Part < UF; ++Part) { 4384 // This is phase one of vectorizing PHIs. 4385 bool ScalarPHI = 4386 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4387 Type *VecTy = 4388 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4389 Value *EntryPart = PHINode::Create( 4390 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4391 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4392 } 4393 return; 4394 } 4395 4396 setDebugLocFromInst(Builder, P); 4397 4398 // This PHINode must be an induction variable. 4399 // Make sure that we know about it. 4400 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4401 4402 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4403 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4404 4405 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4406 // which can be found from the original scalar operations. 4407 switch (II.getKind()) { 4408 case InductionDescriptor::IK_NoInduction: 4409 llvm_unreachable("Unknown induction"); 4410 case InductionDescriptor::IK_IntInduction: 4411 case InductionDescriptor::IK_FpInduction: 4412 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4413 case InductionDescriptor::IK_PtrInduction: { 4414 // Handle the pointer induction variable case. 4415 assert(P->getType()->isPointerTy() && "Unexpected type."); 4416 4417 if (Cost->isScalarAfterVectorization(P, VF)) { 4418 // This is the normalized GEP that starts counting at zero. 4419 Value *PtrInd = 4420 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4421 // Determine the number of scalars we need to generate for each unroll 4422 // iteration. If the instruction is uniform, we only need to generate the 4423 // first lane. Otherwise, we generate all VF values. 4424 unsigned Lanes = 4425 Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.getKnownMinValue(); 4426 for (unsigned Part = 0; Part < UF; ++Part) { 4427 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4428 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4429 Lane + Part * VF.getKnownMinValue()); 4430 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4431 Value *SclrGep = 4432 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4433 SclrGep->setName("next.gep"); 4434 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4435 } 4436 } 4437 return; 4438 } 4439 assert(isa<SCEVConstant>(II.getStep()) && 4440 "Induction step not a SCEV constant!"); 4441 Type *PhiType = II.getStep()->getType(); 4442 4443 // Build a pointer phi 4444 Value *ScalarStartValue = II.getStartValue(); 4445 Type *ScStValueType = ScalarStartValue->getType(); 4446 PHINode *NewPointerPhi = 4447 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4448 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4449 4450 // A pointer induction, performed by using a gep 4451 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4452 Instruction *InductionLoc = LoopLatch->getTerminator(); 4453 const SCEV *ScalarStep = II.getStep(); 4454 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4455 Value *ScalarStepValue = 4456 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4457 Value *InductionGEP = GetElementPtrInst::Create( 4458 ScStValueType->getPointerElementType(), NewPointerPhi, 4459 Builder.CreateMul( 4460 ScalarStepValue, 4461 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4462 "ptr.ind", InductionLoc); 4463 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4464 4465 // Create UF many actual address geps that use the pointer 4466 // phi as base and a vectorized version of the step value 4467 // (<step*0, ..., step*N>) as offset. 4468 for (unsigned Part = 0; Part < UF; ++Part) { 4469 SmallVector<Constant *, 8> Indices; 4470 // Create a vector of consecutive numbers from zero to VF. 4471 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4472 Indices.push_back( 4473 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4474 Constant *StartOffset = ConstantVector::get(Indices); 4475 4476 Value *GEP = Builder.CreateGEP( 4477 ScStValueType->getPointerElementType(), NewPointerPhi, 4478 Builder.CreateMul( 4479 StartOffset, 4480 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4481 "vector.gep")); 4482 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4483 } 4484 } 4485 } 4486 } 4487 4488 /// A helper function for checking whether an integer division-related 4489 /// instruction may divide by zero (in which case it must be predicated if 4490 /// executed conditionally in the scalar code). 4491 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4492 /// Non-zero divisors that are non compile-time constants will not be 4493 /// converted into multiplication, so we will still end up scalarizing 4494 /// the division, but can do so w/o predication. 
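/// For example, given scalar source along the lines of
///   if (p) q = x / d;
/// the division must stay predicated after vectorization unless 'd' is a
/// known non-zero constant, because executing it speculatively for inactive
/// lanes could divide by zero and trap.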
4495 static bool mayDivideByZero(Instruction &I) { 4496 assert((I.getOpcode() == Instruction::UDiv || 4497 I.getOpcode() == Instruction::SDiv || 4498 I.getOpcode() == Instruction::URem || 4499 I.getOpcode() == Instruction::SRem) && 4500 "Unexpected instruction"); 4501 Value *Divisor = I.getOperand(1); 4502 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4503 return !CInt || CInt->isZero(); 4504 } 4505 4506 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, 4507 VPTransformState &State) { 4508 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4509 switch (I.getOpcode()) { 4510 case Instruction::Call: 4511 case Instruction::Br: 4512 case Instruction::PHI: 4513 case Instruction::GetElementPtr: 4514 case Instruction::Select: 4515 llvm_unreachable("This instruction is handled by a different recipe."); 4516 case Instruction::UDiv: 4517 case Instruction::SDiv: 4518 case Instruction::SRem: 4519 case Instruction::URem: 4520 case Instruction::Add: 4521 case Instruction::FAdd: 4522 case Instruction::Sub: 4523 case Instruction::FSub: 4524 case Instruction::FNeg: 4525 case Instruction::Mul: 4526 case Instruction::FMul: 4527 case Instruction::FDiv: 4528 case Instruction::FRem: 4529 case Instruction::Shl: 4530 case Instruction::LShr: 4531 case Instruction::AShr: 4532 case Instruction::And: 4533 case Instruction::Or: 4534 case Instruction::Xor: { 4535 // Just widen unops and binops. 4536 setDebugLocFromInst(Builder, &I); 4537 4538 for (unsigned Part = 0; Part < UF; ++Part) { 4539 SmallVector<Value *, 2> Ops; 4540 for (VPValue *VPOp : User.operands()) 4541 Ops.push_back(State.get(VPOp, Part)); 4542 4543 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4544 4545 if (auto *VecOp = dyn_cast<Instruction>(V)) 4546 VecOp->copyIRFlags(&I); 4547 4548 // Use this vector value for all users of the original instruction. 4549 VectorLoopValueMap.setVectorValue(&I, Part, V); 4550 addMetadata(V, &I); 4551 } 4552 4553 break; 4554 } 4555 case Instruction::ICmp: 4556 case Instruction::FCmp: { 4557 // Widen compares. Generate vector compares. 4558 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4559 auto *Cmp = cast<CmpInst>(&I); 4560 setDebugLocFromInst(Builder, Cmp); 4561 for (unsigned Part = 0; Part < UF; ++Part) { 4562 Value *A = State.get(User.getOperand(0), Part); 4563 Value *B = State.get(User.getOperand(1), Part); 4564 Value *C = nullptr; 4565 if (FCmp) { 4566 // Propagate fast math flags. 4567 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4568 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4569 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4570 } else { 4571 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4572 } 4573 VectorLoopValueMap.setVectorValue(&I, Part, C); 4574 addMetadata(C, &I); 4575 } 4576 4577 break; 4578 } 4579 4580 case Instruction::ZExt: 4581 case Instruction::SExt: 4582 case Instruction::FPToUI: 4583 case Instruction::FPToSI: 4584 case Instruction::FPExt: 4585 case Instruction::PtrToInt: 4586 case Instruction::IntToPtr: 4587 case Instruction::SIToFP: 4588 case Instruction::UIToFP: 4589 case Instruction::Trunc: 4590 case Instruction::FPTrunc: 4591 case Instruction::BitCast: { 4592 auto *CI = cast<CastInst>(&I); 4593 setDebugLocFromInst(Builder, CI); 4594 4595 /// Vectorize casts. 4596 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4597 Type *DestTy = 4598 (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4599 4600 for (unsigned Part = 0; Part < UF; ++Part) { 4601 Value *A = State.get(User.getOperand(0), Part); 4602 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4603 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4604 addMetadata(Cast, &I); 4605 } 4606 break; 4607 } 4608 default: 4609 // This instruction is not vectorized by simple widening. 4610 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4611 llvm_unreachable("Unhandled instruction!"); 4612 } // end of switch. 4613 } 4614 4615 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4616 VPTransformState &State) { 4617 assert(!isa<DbgInfoIntrinsic>(I) && 4618 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4619 setDebugLocFromInst(Builder, &I); 4620 4621 Module *M = I.getParent()->getParent()->getParent(); 4622 auto *CI = cast<CallInst>(&I); 4623 4624 SmallVector<Type *, 4> Tys; 4625 for (Value *ArgOperand : CI->arg_operands()) 4626 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4627 4628 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4629 4630 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4631 // version of the instruction. 4632 // Is it beneficial to perform intrinsic call compared to lib call? 4633 bool NeedToScalarize = false; 4634 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4635 bool UseVectorIntrinsic = 4636 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4637 assert((UseVectorIntrinsic || !NeedToScalarize) && 4638 "Instruction should be scalarized elsewhere."); 4639 4640 for (unsigned Part = 0; Part < UF; ++Part) { 4641 SmallVector<Value *, 4> Args; 4642 for (auto &I : enumerate(ArgOperands.operands())) { 4643 // Some intrinsics have a scalar argument - don't replace it with a 4644 // vector. 4645 Value *Arg; 4646 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4647 Arg = State.get(I.value(), Part); 4648 else 4649 Arg = State.get(I.value(), {0, 0}); 4650 Args.push_back(Arg); 4651 } 4652 4653 Function *VectorF; 4654 if (UseVectorIntrinsic) { 4655 // Use vector version of the intrinsic. 4656 Type *TysForDecl[] = {CI->getType()}; 4657 if (VF.isVector()) { 4658 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4659 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4660 } 4661 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4662 assert(VectorF && "Can't retrieve vector intrinsic."); 4663 } else { 4664 // Use vector version of the function call. 4665 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4666 #ifndef NDEBUG 4667 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4668 "Can't create vector function."); 4669 #endif 4670 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4671 } 4672 SmallVector<OperandBundleDef, 1> OpBundles; 4673 CI->getOperandBundlesAsDefs(OpBundles); 4674 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4675 4676 if (isa<FPMathOperator>(V)) 4677 V->copyFastMathFlags(CI); 4678 4679 VectorLoopValueMap.setVectorValue(&I, Part, V); 4680 addMetadata(V, &I); 4681 } 4682 } 4683 4684 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4685 VPUser &Operands, 4686 bool InvariantCond, 4687 VPTransformState &State) { 4688 setDebugLocFromInst(Builder, &I); 4689 4690 // The condition can be loop invariant but still defined inside the 4691 // loop. 
This means that we can't just use the original 'cond' value. 4692 // We have to take the 'vectorized' value and pick the first lane. 4693 // Instcombine will make this a no-op. 4694 auto *InvarCond = 4695 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4696 4697 for (unsigned Part = 0; Part < UF; ++Part) { 4698 Value *Cond = 4699 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4700 Value *Op0 = State.get(Operands.getOperand(1), Part); 4701 Value *Op1 = State.get(Operands.getOperand(2), Part); 4702 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4703 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4704 addMetadata(Sel, &I); 4705 } 4706 } 4707 4708 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4709 // We should not collect Scalars more than once per VF. Right now, this 4710 // function is called from collectUniformsAndScalars(), which already does 4711 // this check. Collecting Scalars for VF=1 does not make any sense. 4712 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4713 "This function should not be visited twice for the same VF"); 4714 4715 SmallSetVector<Instruction *, 8> Worklist; 4716 4717 // These sets are used to seed the analysis with pointers used by memory 4718 // accesses that will remain scalar. 4719 SmallSetVector<Instruction *, 8> ScalarPtrs; 4720 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4721 auto *Latch = TheLoop->getLoopLatch(); 4722 4723 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4724 // The pointer operands of loads and stores will be scalar as long as the 4725 // memory access is not a gather or scatter operation. The value operand of a 4726 // store will remain scalar if the store is scalarized. 4727 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4728 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4729 assert(WideningDecision != CM_Unknown && 4730 "Widening decision should be ready at this moment"); 4731 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4732 if (Ptr == Store->getValueOperand()) 4733 return WideningDecision == CM_Scalarize; 4734 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4735 "Ptr is neither a value or pointer operand"); 4736 return WideningDecision != CM_GatherScatter; 4737 }; 4738 4739 // A helper that returns true if the given value is a bitcast or 4740 // getelementptr instruction contained in the loop. 4741 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4742 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4743 isa<GetElementPtrInst>(V)) && 4744 !TheLoop->isLoopInvariant(V); 4745 }; 4746 4747 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4748 if (!isa<PHINode>(Ptr) || 4749 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4750 return false; 4751 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4752 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4753 return false; 4754 return isScalarUse(MemAccess, Ptr); 4755 }; 4756 4757 // A helper that evaluates a memory access's use of a pointer. If the 4758 // pointer is actually the pointer induction of a loop, it is being 4759 // inserted into Worklist. If the use will be a scalar use, and the 4760 // pointer is only used by memory accesses, we place the pointer in 4761 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
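  // For illustration (hypothetical IR): a getelementptr whose only user is a
  // consecutive, non-gather/scatter load, e.g.
  //   %gep = getelementptr i32, i32* %A, i64 %i
  //   %v   = load i32, i32* %gep
  // is placed in ScalarPtrs; if %gep were also used by something other than a
  // load or store (say, a ptrtoint or a call), it would be placed in
  // PossibleNonScalarPtrs instead.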
4762 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4763 if (isScalarPtrInduction(MemAccess, Ptr)) { 4764 Worklist.insert(cast<Instruction>(Ptr)); 4765 Instruction *Update = cast<Instruction>( 4766 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 4767 Worklist.insert(Update); 4768 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 4769 << "\n"); 4770 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 4771 << "\n"); 4772 return; 4773 } 4774 // We only care about bitcast and getelementptr instructions contained in 4775 // the loop. 4776 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4777 return; 4778 4779 // If the pointer has already been identified as scalar (e.g., if it was 4780 // also identified as uniform), there's nothing to do. 4781 auto *I = cast<Instruction>(Ptr); 4782 if (Worklist.count(I)) 4783 return; 4784 4785 // If the use of the pointer will be a scalar use, and all users of the 4786 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4787 // place the pointer in PossibleNonScalarPtrs. 4788 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4789 return isa<LoadInst>(U) || isa<StoreInst>(U); 4790 })) 4791 ScalarPtrs.insert(I); 4792 else 4793 PossibleNonScalarPtrs.insert(I); 4794 }; 4795 4796 // We seed the scalars analysis with three classes of instructions: (1) 4797 // instructions marked uniform-after-vectorization and (2) bitcast, 4798 // getelementptr and (pointer) phi instructions used by memory accesses 4799 // requiring a scalar use. 4800 // 4801 // (1) Add to the worklist all instructions that have been identified as 4802 // uniform-after-vectorization. 4803 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4804 4805 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4806 // memory accesses requiring a scalar use. The pointer operands of loads and 4807 // stores will be scalar as long as the memory accesses is not a gather or 4808 // scatter operation. The value operand of a store will remain scalar if the 4809 // store is scalarized. 4810 for (auto *BB : TheLoop->blocks()) 4811 for (auto &I : *BB) { 4812 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4813 evaluatePtrUse(Load, Load->getPointerOperand()); 4814 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4815 evaluatePtrUse(Store, Store->getPointerOperand()); 4816 evaluatePtrUse(Store, Store->getValueOperand()); 4817 } 4818 } 4819 for (auto *I : ScalarPtrs) 4820 if (!PossibleNonScalarPtrs.count(I)) { 4821 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4822 Worklist.insert(I); 4823 } 4824 4825 // Insert the forced scalars. 4826 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4827 // induction variable when the PHI user is scalarized. 4828 auto ForcedScalar = ForcedScalars.find(VF); 4829 if (ForcedScalar != ForcedScalars.end()) 4830 for (auto *I : ForcedScalar->second) 4831 Worklist.insert(I); 4832 4833 // Expand the worklist by looking through any bitcasts and getelementptr 4834 // instructions we've already identified as scalar. This is similar to the 4835 // expansion step in collectLoopUniforms(); however, here we're only 4836 // expanding to include additional bitcasts and getelementptr instructions. 
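  // A rough example of this expansion: if a scalarized access computes its
  // address as
  //   %gep  = getelementptr ...
  //   %cast = bitcast ... %gep ...
  // then once %cast is in the worklist, %gep is pulled in as well, provided
  // each of %gep's users is outside the loop, already in the worklist, or a
  // memory access that uses %gep as a scalar address.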
4837 unsigned Idx = 0; 4838 while (Idx != Worklist.size()) { 4839 Instruction *Dst = Worklist[Idx++]; 4840 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4841 continue; 4842 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4843 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4844 auto *J = cast<Instruction>(U); 4845 return !TheLoop->contains(J) || Worklist.count(J) || 4846 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4847 isScalarUse(J, Src)); 4848 })) { 4849 Worklist.insert(Src); 4850 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4851 } 4852 } 4853 4854 // An induction variable will remain scalar if all users of the induction 4855 // variable and induction variable update remain scalar. 4856 for (auto &Induction : Legal->getInductionVars()) { 4857 auto *Ind = Induction.first; 4858 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4859 4860 // If tail-folding is applied, the primary induction variable will be used 4861 // to feed a vector compare. 4862 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4863 continue; 4864 4865 // Determine if all users of the induction variable are scalar after 4866 // vectorization. 4867 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4868 auto *I = cast<Instruction>(U); 4869 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4870 }); 4871 if (!ScalarInd) 4872 continue; 4873 4874 // Determine if all users of the induction variable update instruction are 4875 // scalar after vectorization. 4876 auto ScalarIndUpdate = 4877 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4878 auto *I = cast<Instruction>(U); 4879 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4880 }); 4881 if (!ScalarIndUpdate) 4882 continue; 4883 4884 // The induction variable and its update instruction will remain scalar. 4885 Worklist.insert(Ind); 4886 Worklist.insert(IndUpdate); 4887 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4888 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4889 << "\n"); 4890 } 4891 4892 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4893 } 4894 4895 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 4896 ElementCount VF) { 4897 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4898 if (!blockNeedsPredication(I->getParent())) 4899 return false; 4900 switch(I->getOpcode()) { 4901 default: 4902 break; 4903 case Instruction::Load: 4904 case Instruction::Store: { 4905 if (!Legal->isMaskRequired(I)) 4906 return false; 4907 auto *Ptr = getLoadStorePointerOperand(I); 4908 auto *Ty = getMemInstValueType(I); 4909 // We have already decided how to vectorize this instruction, get that 4910 // result. 4911 if (VF.isVector()) { 4912 InstWidening WideningDecision = getWideningDecision(I, VF); 4913 assert(WideningDecision != CM_Unknown && 4914 "Widening decision should be ready at this moment"); 4915 return WideningDecision == CM_Scalarize; 4916 } 4917 const Align Alignment = getLoadStoreAlignment(I); 4918 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4919 isLegalMaskedGather(Ty, Alignment)) 4920 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4921 isLegalMaskedScatter(Ty, Alignment)); 4922 } 4923 case Instruction::UDiv: 4924 case Instruction::SDiv: 4925 case Instruction::SRem: 4926 case Instruction::URem: 4927 return mayDivideByZero(*I); 4928 } 4929 return false; 4930 } 4931 4932 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4933 Instruction *I, ElementCount VF) { 4934 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4935 assert(getWideningDecision(I, VF) == CM_Unknown && 4936 "Decision should not be set yet."); 4937 auto *Group = getInterleavedAccessGroup(I); 4938 assert(Group && "Must have a group."); 4939 4940 // If the instruction's allocated size doesn't equal it's type size, it 4941 // requires padding and will be scalarized. 4942 auto &DL = I->getModule()->getDataLayout(); 4943 auto *ScalarTy = getMemInstValueType(I); 4944 if (hasIrregularType(ScalarTy, DL, VF)) 4945 return false; 4946 4947 // Check if masking is required. 4948 // A Group may need masking for one of two reasons: it resides in a block that 4949 // needs predication, or it was decided to use masking to deal with gaps. 4950 bool PredicatedAccessRequiresMasking = 4951 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4952 bool AccessWithGapsRequiresMasking = 4953 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4954 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4955 return true; 4956 4957 // If masked interleaving is required, we expect that the user/target had 4958 // enabled it, because otherwise it either wouldn't have been created or 4959 // it should have been invalidated by the CostModel. 4960 assert(useMaskedInterleavedAccesses(TTI) && 4961 "Masked interleave-groups for predicated accesses are not enabled."); 4962 4963 auto *Ty = getMemInstValueType(I); 4964 const Align Alignment = getLoadStoreAlignment(I); 4965 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4966 : TTI.isLegalMaskedStore(Ty, Alignment); 4967 } 4968 4969 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4970 Instruction *I, ElementCount VF) { 4971 // Get and ensure we have a valid memory instruction. 4972 LoadInst *LI = dyn_cast<LoadInst>(I); 4973 StoreInst *SI = dyn_cast<StoreInst>(I); 4974 assert((LI || SI) && "Invalid memory instruction"); 4975 4976 auto *Ptr = getLoadStorePointerOperand(I); 4977 4978 // In order to be widened, the pointer should be consecutive, first of all. 4979 if (!Legal->isConsecutivePtr(Ptr)) 4980 return false; 4981 4982 // If the instruction is a store located in a predicated block, it will be 4983 // scalarized. 4984 if (isScalarWithPredication(I)) 4985 return false; 4986 4987 // If the instruction's allocated size doesn't equal it's type size, it 4988 // requires padding and will be scalarized. 4989 auto &DL = I->getModule()->getDataLayout(); 4990 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4991 if (hasIrregularType(ScalarTy, DL, VF)) 4992 return false; 4993 4994 return true; 4995 } 4996 4997 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4998 // We should not collect Uniforms more than once per VF. Right now, 4999 // this function is called from collectUniformsAndScalars(), which 5000 // already does this check. Collecting Uniforms for VF=1 does not make any 5001 // sense. 
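  // For illustration: the typical uniform value is the pointer operand of a
  // consecutive load or store. The widened access only needs the lane-0
  // address, so all VF lanes would otherwise just hold copies of the same
  // pointer.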
5002 5003 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5004 "This function should not be visited twice for the same VF"); 5005 5006 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5007 // not analyze again. Uniforms.count(VF) will return 1. 5008 Uniforms[VF].clear(); 5009 5010 // We now know that the loop is vectorizable! 5011 // Collect instructions inside the loop that will remain uniform after 5012 // vectorization. 5013 5014 // Global values, params and instructions outside of current loop are out of 5015 // scope. 5016 auto isOutOfScope = [&](Value *V) -> bool { 5017 Instruction *I = dyn_cast<Instruction>(V); 5018 return (!I || !TheLoop->contains(I)); 5019 }; 5020 5021 SetVector<Instruction *> Worklist; 5022 BasicBlock *Latch = TheLoop->getLoopLatch(); 5023 5024 // Instructions that are scalar with predication must not be considered 5025 // uniform after vectorization, because that would create an erroneous 5026 // replicating region where only a single instance out of VF should be formed. 5027 // TODO: optimize such seldom cases if found important, see PR40816. 5028 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5029 if (isScalarWithPredication(I, VF)) { 5030 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5031 << *I << "\n"); 5032 return; 5033 } 5034 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5035 Worklist.insert(I); 5036 }; 5037 5038 // Start with the conditional branch. If the branch condition is an 5039 // instruction contained in the loop that is only used by the branch, it is 5040 // uniform. 5041 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5042 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5043 addToWorklistIfAllowed(Cmp); 5044 5045 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 5046 // are pointers that are treated like consecutive pointers during 5047 // vectorization. The pointer operands of interleaved accesses are an 5048 // example. 5049 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 5050 5051 // Holds pointer operands of instructions that are possibly non-uniform. 5052 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 5053 5054 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5055 InstWidening WideningDecision = getWideningDecision(I, VF); 5056 assert(WideningDecision != CM_Unknown && 5057 "Widening decision should be ready at this moment"); 5058 5059 return (WideningDecision == CM_Widen || 5060 WideningDecision == CM_Widen_Reverse || 5061 WideningDecision == CM_Interleave); 5062 }; 5063 // Iterate over the instructions in the loop, and collect all 5064 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 5065 // that a consecutive-like pointer operand will be scalarized, we collect it 5066 // in PossibleNonUniformPtrs instead. We use two sets here because a single 5067 // getelementptr instruction can be used by both vectorized and scalarized 5068 // memory instructions. For example, if a loop loads and stores from the same 5069 // location, but the store is conditional, the store will be scalarized, and 5070 // the getelementptr won't remain uniform. 5071 for (auto *BB : TheLoop->blocks()) 5072 for (auto &I : *BB) { 5073 // If there's no pointer operand, there's nothing to do. 
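      // A rough sketch of the mixed case described above (hypothetical IR):
      //   %gep = getelementptr i32, i32* %p, i64 %i
      //   %v   = load i32, i32* %gep      ; widened -> uniform use of %gep
      //   store i32 %new, i32* %gep       ; predicated -> scalarized
      // The scalarized store needs a per-lane address, so %gep is recorded in
      // PossibleNonUniformPtrs and does not remain uniform.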
5074 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 5075 if (!Ptr) 5076 continue; 5077 5078 // True if all users of Ptr are memory accesses that have Ptr as their 5079 // pointer operand. 5080 auto UsersAreMemAccesses = 5081 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 5082 return getLoadStorePointerOperand(U) == Ptr; 5083 }); 5084 5085 // Ensure the memory instruction will not be scalarized or used by 5086 // gather/scatter, making its pointer operand non-uniform. If the pointer 5087 // operand is used by any instruction other than a memory access, we 5088 // conservatively assume the pointer operand may be non-uniform. 5089 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 5090 PossibleNonUniformPtrs.insert(Ptr); 5091 5092 // If the memory instruction will be vectorized and its pointer operand 5093 // is consecutive-like, or interleaving - the pointer operand should 5094 // remain uniform. 5095 else 5096 ConsecutiveLikePtrs.insert(Ptr); 5097 } 5098 5099 // Add to the Worklist all consecutive and consecutive-like pointers that 5100 // aren't also identified as possibly non-uniform. 5101 for (auto *V : ConsecutiveLikePtrs) 5102 if (!PossibleNonUniformPtrs.count(V)) 5103 addToWorklistIfAllowed(V); 5104 5105 // Expand Worklist in topological order: whenever a new instruction 5106 // is added , its users should be already inside Worklist. It ensures 5107 // a uniform instruction will only be used by uniform instructions. 5108 unsigned idx = 0; 5109 while (idx != Worklist.size()) { 5110 Instruction *I = Worklist[idx++]; 5111 5112 for (auto OV : I->operand_values()) { 5113 // isOutOfScope operands cannot be uniform instructions. 5114 if (isOutOfScope(OV)) 5115 continue; 5116 // First order recurrence Phi's should typically be considered 5117 // non-uniform. 5118 auto *OP = dyn_cast<PHINode>(OV); 5119 if (OP && Legal->isFirstOrderRecurrence(OP)) 5120 continue; 5121 // If all the users of the operand are uniform, then add the 5122 // operand into the uniform worklist. 5123 auto *OI = cast<Instruction>(OV); 5124 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5125 auto *J = cast<Instruction>(U); 5126 return Worklist.count(J) || 5127 (OI == getLoadStorePointerOperand(J) && 5128 isUniformDecision(J, VF)); 5129 })) 5130 addToWorklistIfAllowed(OI); 5131 } 5132 } 5133 5134 // Returns true if Ptr is the pointer operand of a memory access instruction 5135 // I, and I is known to not require scalarization. 5136 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5137 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5138 }; 5139 5140 // For an instruction to be added into Worklist above, all its users inside 5141 // the loop should also be in Worklist. However, this condition cannot be 5142 // true for phi nodes that form a cyclic dependence. We must process phi 5143 // nodes separately. An induction variable will remain uniform if all users 5144 // of the induction variable and induction variable update remain uniform. 5145 // The code below handles both pointer and non-pointer induction variables. 5146 for (auto &Induction : Legal->getInductionVars()) { 5147 auto *Ind = Induction.first; 5148 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5149 5150 // Determine if all users of the induction variable are uniform after 5151 // vectorization. 
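      // For example: an induction 'i' that is only used to form addresses of
      // widened consecutive accesses and to compute 'i + 1' stays uniform; if
      // 'i' were also stored as a value (A[i] = i), that lane-dependent use
      // would force a widened (vector) induction instead.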
5152 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5153 auto *I = cast<Instruction>(U); 5154 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5155 isVectorizedMemAccessUse(I, Ind); 5156 }); 5157 if (!UniformInd) 5158 continue; 5159 5160 // Determine if all users of the induction variable update instruction are 5161 // uniform after vectorization. 5162 auto UniformIndUpdate = 5163 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5164 auto *I = cast<Instruction>(U); 5165 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5166 isVectorizedMemAccessUse(I, IndUpdate); 5167 }); 5168 if (!UniformIndUpdate) 5169 continue; 5170 5171 // The induction variable and its update instruction will remain uniform. 5172 addToWorklistIfAllowed(Ind); 5173 addToWorklistIfAllowed(IndUpdate); 5174 } 5175 5176 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5177 } 5178 5179 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5180 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5181 5182 if (Legal->getRuntimePointerChecking()->Need) { 5183 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5184 "runtime pointer checks needed. Enable vectorization of this " 5185 "loop with '#pragma clang loop vectorize(enable)' when " 5186 "compiling with -Os/-Oz", 5187 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5188 return true; 5189 } 5190 5191 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5192 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5193 "runtime SCEV checks needed. Enable vectorization of this " 5194 "loop with '#pragma clang loop vectorize(enable)' when " 5195 "compiling with -Os/-Oz", 5196 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5197 return true; 5198 } 5199 5200 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5201 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5202 reportVectorizationFailure("Runtime stride check for small trip count", 5203 "runtime stride == 1 checks needed. Enable vectorization of " 5204 "this loop without such check by compiling with -Os/-Oz", 5205 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5206 return true; 5207 } 5208 5209 return false; 5210 } 5211 5212 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, 5213 unsigned UserIC) { 5214 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5215 // TODO: It may by useful to do since it's still likely to be dynamically 5216 // uniform if the target can skip. 5217 reportVectorizationFailure( 5218 "Not inserting runtime ptr check for divergent target", 5219 "runtime pointer checks needed. Not enabled for divergent target", 5220 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5221 return None; 5222 } 5223 5224 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5225 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5226 if (TC == 1) { 5227 reportVectorizationFailure("Single iteration (non) loop", 5228 "loop trip count is one, irrelevant for vectorization", 5229 "SingleIterationLoop", ORE, TheLoop); 5230 return None; 5231 } 5232 5233 switch (ScalarEpilogueStatus) { 5234 case CM_ScalarEpilogueAllowed: 5235 return UserVF ? 
UserVF : computeFeasibleMaxVF(TC); 5236 case CM_ScalarEpilogueNotNeededUsePredicate: 5237 LLVM_DEBUG( 5238 dbgs() << "LV: vector predicate hint/switch found.\n" 5239 << "LV: Not allowing scalar epilogue, creating predicated " 5240 << "vector loop.\n"); 5241 break; 5242 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5243 // fallthrough as a special case of OptForSize 5244 case CM_ScalarEpilogueNotAllowedOptSize: 5245 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5246 LLVM_DEBUG( 5247 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5248 else 5249 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5250 << "count.\n"); 5251 5252 // Bail if runtime checks are required, which are not good when optimising 5253 // for size. 5254 if (runtimeChecksRequired()) 5255 return None; 5256 break; 5257 } 5258 5259 // Now try the tail folding 5260 5261 // Invalidate interleave groups that require an epilogue if we can't mask 5262 // the interleave-group. 5263 if (!useMaskedInterleavedAccesses(TTI)) { 5264 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5265 "No decisions should have been taken at this point"); 5266 // Note: There is no need to invalidate any cost modeling decisions here, as 5267 // non where taken so far. 5268 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5269 } 5270 5271 unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC); 5272 assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2"); 5273 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; 5274 if (TC > 0 && TC % MaxVFtimesIC == 0) { 5275 // Accept MaxVF if we do not have a tail. 5276 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5277 return MaxVF; 5278 } 5279 5280 // If we don't know the precise trip count, or if the trip count that we 5281 // found modulo the vectorization factor is not zero, try to fold the tail 5282 // by masking. 5283 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5284 if (Legal->prepareToFoldTailByMasking()) { 5285 FoldTailByMasking = true; 5286 return MaxVF; 5287 } 5288 5289 // If there was a tail-folding hint/switch, but we can't fold the tail by 5290 // masking, fallback to a vectorization with a scalar epilogue. 5291 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5292 if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) { 5293 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5294 return None; 5295 } 5296 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5297 "scalar epilogue instead.\n"); 5298 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5299 return MaxVF; 5300 } 5301 5302 if (TC == 0) { 5303 reportVectorizationFailure( 5304 "Unable to calculate the loop count due to complex control flow", 5305 "unable to calculate the loop count due to complex control flow", 5306 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5307 return None; 5308 } 5309 5310 reportVectorizationFailure( 5311 "Cannot optimize for size and vectorize at the same time.", 5312 "cannot optimize for size and vectorize at the same time. 
" 5313 "Enable vectorization of this loop with '#pragma clang loop " 5314 "vectorize(enable)' when compiling with -Os/-Oz", 5315 "NoTailLoopWithOptForSize", ORE, TheLoop); 5316 return None; 5317 } 5318 5319 unsigned 5320 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5321 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5322 unsigned SmallestType, WidestType; 5323 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5324 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5325 5326 // Get the maximum safe dependence distance in bits computed by LAA. 5327 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5328 // the memory accesses that is most restrictive (involved in the smallest 5329 // dependence distance). 5330 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5331 5332 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5333 5334 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5335 // Note that both WidestRegister and WidestType may not be a powers of 2. 5336 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5337 5338 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5339 << " / " << WidestType << " bits.\n"); 5340 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5341 << WidestRegister << " bits.\n"); 5342 5343 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5344 " into one vector!"); 5345 if (MaxVectorSize == 0) { 5346 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5347 MaxVectorSize = 1; 5348 return MaxVectorSize; 5349 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5350 isPowerOf2_32(ConstTripCount)) { 5351 // We need to clamp the VF to be the ConstTripCount. There is no point in 5352 // choosing a higher viable VF as done in the loop below. 5353 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5354 << ConstTripCount << "\n"); 5355 MaxVectorSize = ConstTripCount; 5356 return MaxVectorSize; 5357 } 5358 5359 unsigned MaxVF = MaxVectorSize; 5360 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5361 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5362 // Collect all viable vectorization factors larger than the default MaxVF 5363 // (i.e. MaxVectorSize). 5364 SmallVector<ElementCount, 8> VFs; 5365 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5366 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5367 VFs.push_back(ElementCount::getFixed(VS)); 5368 5369 // For each VF calculate its register usage. 5370 auto RUs = calculateRegisterUsage(VFs); 5371 5372 // Select the largest VF which doesn't require more registers than existing 5373 // ones. 
5374 for (int i = RUs.size() - 1; i >= 0; --i) { 5375 bool Selected = true; 5376 for (auto& pair : RUs[i].MaxLocalUsers) { 5377 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5378 if (pair.second > TargetNumRegisters) 5379 Selected = false; 5380 } 5381 if (Selected) { 5382 MaxVF = VFs[i].getKnownMinValue(); 5383 break; 5384 } 5385 } 5386 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5387 if (MaxVF < MinVF) { 5388 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5389 << ") with target's minimum: " << MinVF << '\n'); 5390 MaxVF = MinVF; 5391 } 5392 } 5393 } 5394 return MaxVF; 5395 } 5396 5397 VectorizationFactor 5398 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5399 float Cost = expectedCost(ElementCount::getFixed(1)).first; 5400 const float ScalarCost = Cost; 5401 unsigned Width = 1; 5402 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5403 5404 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5405 if (ForceVectorization && MaxVF > 1) { 5406 // Ignore scalar width, because the user explicitly wants vectorization. 5407 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5408 // evaluation. 5409 Cost = std::numeric_limits<float>::max(); 5410 } 5411 5412 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5413 // Notice that the vector loop needs to be executed less times, so 5414 // we need to divide the cost of the vector loops by the width of 5415 // the vector elements. 5416 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5417 float VectorCost = C.first / (float)i; 5418 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5419 << " costs: " << (int)VectorCost << ".\n"); 5420 if (!C.second && !ForceVectorization) { 5421 LLVM_DEBUG( 5422 dbgs() << "LV: Not considering vector loop of width " << i 5423 << " because it will not generate any vector instructions.\n"); 5424 continue; 5425 } 5426 if (VectorCost < Cost) { 5427 Cost = VectorCost; 5428 Width = i; 5429 } 5430 } 5431 5432 if (!EnableCondStoresVectorization && NumPredStores) { 5433 reportVectorizationFailure("There are conditional stores.", 5434 "store that is conditionally executed prevents vectorization", 5435 "ConditionalStore", ORE, TheLoop); 5436 Width = 1; 5437 Cost = ScalarCost; 5438 } 5439 5440 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5441 << "LV: Vectorization seems to be not beneficial, " 5442 << "but was forced by a user.\n"); 5443 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5444 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5445 (unsigned)(Width * Cost)}; 5446 return Factor; 5447 } 5448 5449 std::pair<unsigned, unsigned> 5450 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5451 unsigned MinWidth = -1U; 5452 unsigned MaxWidth = 8; 5453 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5454 5455 // For each block. 5456 for (BasicBlock *BB : TheLoop->blocks()) { 5457 // For each instruction in the loop. 5458 for (Instruction &I : BB->instructionsWithoutDebug()) { 5459 Type *T = I.getType(); 5460 5461 // Skip ignored values. 5462 if (ValuesToIgnore.count(&I)) 5463 continue; 5464 5465 // Only examine Loads, Stores and PHINodes. 5466 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5467 continue; 5468 5469 // Examine PHI nodes that are reduction variables. Update the type to 5470 // account for the recurrence type. 
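      // For example (illustrative), a reduction phi declared as i32 whose
      // computation the recurrence descriptor has narrowed to i8 should
      // contribute i8, not i32, to the smallest/widest type calculation.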
5471 if (auto *PN = dyn_cast<PHINode>(&I)) { 5472 if (!Legal->isReductionVariable(PN)) 5473 continue; 5474 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5475 T = RdxDesc.getRecurrenceType(); 5476 } 5477 5478 // Examine the stored values. 5479 if (auto *ST = dyn_cast<StoreInst>(&I)) 5480 T = ST->getValueOperand()->getType(); 5481 5482 // Ignore loaded pointer types and stored pointer types that are not 5483 // vectorizable. 5484 // 5485 // FIXME: The check here attempts to predict whether a load or store will 5486 // be vectorized. We only know this for certain after a VF has 5487 // been selected. Here, we assume that if an access can be 5488 // vectorized, it will be. We should also look at extending this 5489 // optimization to non-pointer types. 5490 // 5491 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5492 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5493 continue; 5494 5495 MinWidth = std::min(MinWidth, 5496 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5497 MaxWidth = std::max(MaxWidth, 5498 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5499 } 5500 } 5501 5502 return {MinWidth, MaxWidth}; 5503 } 5504 5505 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5506 unsigned LoopCost) { 5507 // -- The interleave heuristics -- 5508 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5509 // There are many micro-architectural considerations that we can't predict 5510 // at this level. For example, frontend pressure (on decode or fetch) due to 5511 // code size, or the number and capabilities of the execution ports. 5512 // 5513 // We use the following heuristics to select the interleave count: 5514 // 1. If the code has reductions, then we interleave to break the cross 5515 // iteration dependency. 5516 // 2. If the loop is really small, then we interleave to reduce the loop 5517 // overhead. 5518 // 3. We don't interleave if we think that we will spill registers to memory 5519 // due to the increased register pressure. 5520 5521 if (!isScalarEpilogueAllowed()) 5522 return 1; 5523 5524 // We used the distance for the interleave count. 5525 if (Legal->getMaxSafeDepDistBytes() != -1U) 5526 return 1; 5527 5528 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5529 const bool HasReductions = !Legal->getReductionVars().empty(); 5530 // Do not interleave loops with a relatively small known or estimated trip 5531 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5532 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5533 // because with the above conditions interleaving can expose ILP and break 5534 // cross iteration dependences for reductions. 5535 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5536 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5537 return 1; 5538 5539 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5540 // We divide by these constants so assume that we have at least one 5541 // instruction that uses at least one register. 5542 for (auto& pair : R.MaxLocalUsers) { 5543 pair.second = std::max(pair.second, 1U); 5544 } 5545 5546 // We calculate the interleave count using the following formula. 5547 // Subtract the number of loop invariants from the number of available 5548 // registers. These registers are used by all of the interleaved instances. 
5549 // Next, divide the remaining registers by the number of registers that is 5550 // required by the loop, in order to estimate how many parallel instances 5551 // fit without causing spills. All of this is rounded down if necessary to be 5552 // a power of two. We want power of two interleave count to simplify any 5553 // addressing operations or alignment considerations. 5554 // We also want power of two interleave counts to ensure that the induction 5555 // variable of the vector loop wraps to zero, when tail is folded by masking; 5556 // this currently happens when OptForSize, in which case IC is set to 1 above. 5557 unsigned IC = UINT_MAX; 5558 5559 for (auto& pair : R.MaxLocalUsers) { 5560 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5561 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5562 << " registers of " 5563 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5564 if (VF.isScalar()) { 5565 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5566 TargetNumRegisters = ForceTargetNumScalarRegs; 5567 } else { 5568 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5569 TargetNumRegisters = ForceTargetNumVectorRegs; 5570 } 5571 unsigned MaxLocalUsers = pair.second; 5572 unsigned LoopInvariantRegs = 0; 5573 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5574 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5575 5576 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5577 // Don't count the induction variable as interleaved. 5578 if (EnableIndVarRegisterHeur) { 5579 TmpIC = 5580 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5581 std::max(1U, (MaxLocalUsers - 1))); 5582 } 5583 5584 IC = std::min(IC, TmpIC); 5585 } 5586 5587 // Clamp the interleave ranges to reasonable counts. 5588 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5589 unsigned MaxInterleaveCount = 5590 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5591 5592 // Check if the user has overridden the max. 5593 if (VF.isScalar()) { 5594 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5595 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5596 } else { 5597 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5598 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5599 } 5600 5601 // If trip count is known or estimated compile time constant, limit the 5602 // interleave count to be less than the trip count divided by VF. 5603 if (BestKnownTC) { 5604 MaxInterleaveCount = 5605 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5606 } 5607 5608 // If we did not calculate the cost for VF (because the user selected the VF) 5609 // then we calculate the cost of VF here. 5610 if (LoopCost == 0) 5611 LoopCost = expectedCost(VF).first; 5612 5613 assert(LoopCost && "Non-zero loop cost expected"); 5614 5615 // Clamp the calculated IC to be between the 1 and the max interleave count 5616 // that the target and trip count allows. 5617 if (IC > MaxInterleaveCount) 5618 IC = MaxInterleaveCount; 5619 else if (IC < 1) 5620 IC = 1; 5621 5622 // Interleave if we vectorized this loop and there is a reduction that could 5623 // benefit from interleaving. 
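  // For example, interleaving a vectorized sum reduction by IC = 2 keeps two
  // independent partial accumulators in flight that are combined after the
  // loop, hiding the latency of the loop-carried reduction update.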
5624 if (VF.isVector() && HasReductions) { 5625 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5626 return IC; 5627 } 5628 5629 // Note that if we've already vectorized the loop we will have done the 5630 // runtime check and so interleaving won't require further checks. 5631 bool InterleavingRequiresRuntimePointerCheck = 5632 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5633 5634 // We want to interleave small loops in order to reduce the loop overhead and 5635 // potentially expose ILP opportunities. 5636 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5637 << "LV: IC is " << IC << '\n' 5638 << "LV: VF is " << VF.getKnownMinValue() << '\n'); 5639 const bool AggressivelyInterleaveReductions = 5640 TTI.enableAggressiveInterleaving(HasReductions); 5641 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5642 // We assume that the cost overhead is 1 and we use the cost model 5643 // to estimate the cost of the loop and interleave until the cost of the 5644 // loop overhead is about 5% of the cost of the loop. 5645 unsigned SmallIC = 5646 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5647 5648 // Interleave until store/load ports (estimated by max interleave count) are 5649 // saturated. 5650 unsigned NumStores = Legal->getNumStores(); 5651 unsigned NumLoads = Legal->getNumLoads(); 5652 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5653 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5654 5655 // If we have a scalar reduction (vector reductions are already dealt with 5656 // by this point), we can increase the critical path length if the loop 5657 // we're interleaving is inside another loop. Limit, by default to 2, so the 5658 // critical path only gets increased by one reduction operation. 5659 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5660 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5661 SmallIC = std::min(SmallIC, F); 5662 StoresIC = std::min(StoresIC, F); 5663 LoadsIC = std::min(LoadsIC, F); 5664 } 5665 5666 if (EnableLoadStoreRuntimeInterleave && 5667 std::max(StoresIC, LoadsIC) > SmallIC) { 5668 LLVM_DEBUG( 5669 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5670 return std::max(StoresIC, LoadsIC); 5671 } 5672 5673 // If there are scalar reductions and TTI has enabled aggressive 5674 // interleaving for reductions, we will interleave to expose ILP. 5675 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5676 AggressivelyInterleaveReductions) { 5677 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5678 // Interleave no less than SmallIC but not as aggressive as the normal IC 5679 // to satisfy the rare situation when resources are too limited. 5680 return std::max(IC / 2, SmallIC); 5681 } else { 5682 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5683 return SmallIC; 5684 } 5685 } 5686 5687 // Interleave if this is a large loop (small loops are already dealt with by 5688 // this point) that could benefit from interleaving. 
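  // This point is only reached when LoopCost >= SmallLoopCost or when
  // interleaving requires a runtime pointer check; targets that report
  // enableAggressiveInterleaving(HasReductions) still get the full IC below.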
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are met
  // before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it
  // in a set. If we find this value in the multi-map then we remove it from
  // the set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because, when we unroll, loop-invariant values
  // do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
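  // For example, EndPoint = {A -> 3, B -> 3, C -> 5} becomes
  // TransposeEnds = {3 -> [A, B], 5 -> [C]}, so all intervals ending at a
  // given instruction index can be closed with a single lookup.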
5767 for (auto &Interval : EndPoint) 5768 TransposeEnds[Interval.second].push_back(Interval.first); 5769 5770 SmallPtrSet<Instruction *, 8> OpenIntervals; 5771 5772 // Get the size of the widest register. 5773 unsigned MaxSafeDepDist = -1U; 5774 if (Legal->getMaxSafeDepDistBytes() != -1U) 5775 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5776 unsigned WidestRegister = 5777 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5778 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5779 5780 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5781 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5782 5783 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5784 5785 // A lambda that gets the register usage for the given type and VF. 5786 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) { 5787 if (Ty->isTokenTy()) 5788 return 0U; 5789 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5790 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5791 return std::max<unsigned>(1, VF.getKnownMinValue() * TypeSize / 5792 WidestRegister); 5793 }; 5794 5795 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5796 Instruction *I = IdxToInstr[i]; 5797 5798 // Remove all of the instructions that end at this location. 5799 InstrList &List = TransposeEnds[i]; 5800 for (Instruction *ToRemove : List) 5801 OpenIntervals.erase(ToRemove); 5802 5803 // Ignore instructions that are never used within the loop. 5804 if (!Ends.count(I)) 5805 continue; 5806 5807 // Skip ignored values. 5808 if (ValuesToIgnore.count(I)) 5809 continue; 5810 5811 // For each VF find the maximum usage of registers. 5812 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5813 // Count the number of live intervals. 5814 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5815 5816 if (VFs[j].isScalar()) { 5817 for (auto Inst : OpenIntervals) { 5818 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5819 if (RegUsage.find(ClassID) == RegUsage.end()) 5820 RegUsage[ClassID] = 1; 5821 else 5822 RegUsage[ClassID] += 1; 5823 } 5824 } else { 5825 collectUniformsAndScalars(VFs[j]); 5826 for (auto Inst : OpenIntervals) { 5827 // Skip ignored values for VF > 1. 5828 if (VecValuesToIgnore.count(Inst)) 5829 continue; 5830 if (isScalarAfterVectorization(Inst, VFs[j])) { 5831 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5832 if (RegUsage.find(ClassID) == RegUsage.end()) 5833 RegUsage[ClassID] = 1; 5834 else 5835 RegUsage[ClassID] += 1; 5836 } else { 5837 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5838 if (RegUsage.find(ClassID) == RegUsage.end()) 5839 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5840 else 5841 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5842 } 5843 } 5844 } 5845 5846 for (auto& pair : RegUsage) { 5847 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5848 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5849 else 5850 MaxUsages[j][pair.first] = pair.second; 5851 } 5852 } 5853 5854 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5855 << OpenIntervals.size() << '\n'); 5856 5857 // Add the current instruction to the list of open intervals. 
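    // Note that intervals whose last use has already been passed were erased
    // at the top of this iteration, and the usage recorded for index i does
    // not count I itself, which only becomes live here.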
5858 OpenIntervals.insert(I); 5859 } 5860 5861 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5862 SmallMapVector<unsigned, unsigned, 4> Invariant; 5863 5864 for (auto Inst : LoopInvariants) { 5865 unsigned Usage = 5866 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5867 unsigned ClassID = 5868 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 5869 if (Invariant.find(ClassID) == Invariant.end()) 5870 Invariant[ClassID] = Usage; 5871 else 5872 Invariant[ClassID] += Usage; 5873 } 5874 5875 LLVM_DEBUG({ 5876 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5877 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5878 << " item\n"; 5879 for (const auto &pair : MaxUsages[i]) { 5880 dbgs() << "LV(REG): RegisterClass: " 5881 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5882 << " registers\n"; 5883 } 5884 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5885 << " item\n"; 5886 for (const auto &pair : Invariant) { 5887 dbgs() << "LV(REG): RegisterClass: " 5888 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5889 << " registers\n"; 5890 } 5891 }); 5892 5893 RU.LoopInvariantRegs = Invariant; 5894 RU.MaxLocalUsers = MaxUsages[i]; 5895 RUs[i] = RU; 5896 } 5897 5898 return RUs; 5899 } 5900 5901 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5902 // TODO: Cost model for emulated masked load/store is completely 5903 // broken. This hack guides the cost model to use an artificially 5904 // high enough value to practically disable vectorization with such 5905 // operations, except where previously deployed legality hack allowed 5906 // using very low cost values. This is to avoid regressions coming simply 5907 // from moving "masked load/store" check from legality to cost model. 5908 // Masked Load/Gather emulation was previously never allowed. 5909 // Limited number of Masked Store/Scatter emulation was allowed. 5910 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5911 return isa<LoadInst>(I) || 5912 (isa<StoreInst>(I) && 5913 NumPredStores > NumberOfStoresToPredicate); 5914 } 5915 5916 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 5917 // If we aren't vectorizing the loop, or if we've already collected the 5918 // instructions to scalarize, there's nothing to do. Collection may already 5919 // have occurred if we have a user-selected VF and are now computing the 5920 // expected cost for interleaving. 5921 if (VF.isScalar() || VF.isZero() || 5922 InstsToScalarize.find(VF) != InstsToScalarize.end()) 5923 return; 5924 5925 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5926 // not profitable to scalarize any instructions, the presence of VF in the 5927 // map will indicate that we've analyzed it already. 5928 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5929 5930 // Find all the instructions that are scalar with predication in the loop and 5931 // determine if it would be better to not if-convert the blocks they are in. 5932 // If so, we also record the instructions to scalarize. 5933 for (BasicBlock *BB : TheLoop->blocks()) { 5934 if (!blockNeedsPredication(BB)) 5935 continue; 5936 for (Instruction &I : *BB) 5937 if (isScalarWithPredication(&I)) { 5938 ScalarCostsTy ScalarCosts; 5939 // Do not apply discount logic if hacked cost is needed 5940 // for emulated masked memrefs. 
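        // A non-negative discount means the scalarized form of the chain
        // feeding this predicated instruction is expected to cost no more
        // than the vectorized form, so its per-instruction scalar costs are
        // recorded in InstsToScalarize.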
5941 if (!useEmulatedMaskMemRefHack(&I) && 5942 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5943 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5944 // Remember that BB will remain after vectorization. 5945 PredicatedBBsAfterVectorization.insert(BB); 5946 } 5947 } 5948 } 5949 5950 int LoopVectorizationCostModel::computePredInstDiscount( 5951 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5952 ElementCount VF) { 5953 assert(!isUniformAfterVectorization(PredInst, VF) && 5954 "Instruction marked uniform-after-vectorization will be predicated"); 5955 5956 // Initialize the discount to zero, meaning that the scalar version and the 5957 // vector version cost the same. 5958 int Discount = 0; 5959 5960 // Holds instructions to analyze. The instructions we visit are mapped in 5961 // ScalarCosts. Those instructions are the ones that would be scalarized if 5962 // we find that the scalar version costs less. 5963 SmallVector<Instruction *, 8> Worklist; 5964 5965 // Returns true if the given instruction can be scalarized. 5966 auto canBeScalarized = [&](Instruction *I) -> bool { 5967 // We only attempt to scalarize instructions forming a single-use chain 5968 // from the original predicated block that would otherwise be vectorized. 5969 // Although not strictly necessary, we give up on instructions we know will 5970 // already be scalar to avoid traversing chains that are unlikely to be 5971 // beneficial. 5972 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5973 isScalarAfterVectorization(I, VF)) 5974 return false; 5975 5976 // If the instruction is scalar with predication, it will be analyzed 5977 // separately. We ignore it within the context of PredInst. 5978 if (isScalarWithPredication(I)) 5979 return false; 5980 5981 // If any of the instruction's operands are uniform after vectorization, 5982 // the instruction cannot be scalarized. This prevents, for example, a 5983 // masked load from being scalarized. 5984 // 5985 // We assume we will only emit a value for lane zero of an instruction 5986 // marked uniform after vectorization, rather than VF identical values. 5987 // Thus, if we scalarize an instruction that uses a uniform, we would 5988 // create uses of values corresponding to the lanes we aren't emitting code 5989 // for. This behavior can be changed by allowing getScalarValue to clone 5990 // the lane zero values for uniforms rather than asserting. 5991 for (Use &U : I->operands()) 5992 if (auto *J = dyn_cast<Instruction>(U.get())) 5993 if (isUniformAfterVectorization(J, VF)) 5994 return false; 5995 5996 // Otherwise, we can scalarize the instruction. 5997 return true; 5998 }; 5999 6000 // Compute the expected cost discount from scalarizing the entire expression 6001 // feeding the predicated instruction. We currently only consider expressions 6002 // that are single-use instruction chains. 6003 Worklist.push_back(PredInst); 6004 while (!Worklist.empty()) { 6005 Instruction *I = Worklist.pop_back_val(); 6006 6007 // If we've already analyzed the instruction, there's nothing to do. 6008 if (ScalarCosts.find(I) != ScalarCosts.end()) 6009 continue; 6010 6011 // Compute the cost of the vector instruction. Note that this cost already 6012 // includes the scalarization overhead of the predicated instruction. 6013 unsigned VectorCost = getInstructionCost(I, VF).first; 6014 6015 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6016 // the instruction as if it wasn't if-converted and instead remained in the 6017 // predicated block. We will scale this cost by block probability after 6018 // computing the scalarization overhead. 6019 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6020 unsigned ScalarCost = 6021 VF.getKnownMinValue() * 6022 getInstructionCost(I, ElementCount::getFixed(1)).first; 6023 6024 // Compute the scalarization overhead of needed insertelement instructions 6025 // and phi nodes. 6026 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6027 ScalarCost += TTI.getScalarizationOverhead( 6028 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6029 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6030 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6031 ScalarCost += 6032 VF.getKnownMinValue() * 6033 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6034 } 6035 6036 // Compute the scalarization overhead of needed extractelement 6037 // instructions. For each of the instruction's operands, if the operand can 6038 // be scalarized, add it to the worklist; otherwise, account for the 6039 // overhead. 6040 for (Use &U : I->operands()) 6041 if (auto *J = dyn_cast<Instruction>(U.get())) { 6042 assert(VectorType::isValidElementType(J->getType()) && 6043 "Instruction has non-scalar type"); 6044 if (canBeScalarized(J)) 6045 Worklist.push_back(J); 6046 else if (needsExtract(J, VF)) { 6047 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6048 ScalarCost += TTI.getScalarizationOverhead( 6049 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6050 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6051 } 6052 } 6053 6054 // Scale the total scalar cost by block probability. 6055 ScalarCost /= getReciprocalPredBlockProb(); 6056 6057 // Compute the discount. A non-negative discount means the vector version 6058 // of the instruction costs more, and scalarizing would be beneficial. 6059 Discount += VectorCost - ScalarCost; 6060 ScalarCosts[I] = ScalarCost; 6061 } 6062 6063 return Discount; 6064 } 6065 6066 LoopVectorizationCostModel::VectorizationCostTy 6067 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6068 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6069 VectorizationCostTy Cost; 6070 6071 // For each block. 6072 for (BasicBlock *BB : TheLoop->blocks()) { 6073 VectorizationCostTy BlockCost; 6074 6075 // For each instruction in the old loop. 6076 for (Instruction &I : BB->instructionsWithoutDebug()) { 6077 // Skip ignored values. 6078 if (ValuesToIgnore.count(&I) || 6079 (VF.isVector() && VecValuesToIgnore.count(&I))) 6080 continue; 6081 6082 VectorizationCostTy C = getInstructionCost(&I, VF); 6083 6084 // Check if we should override the cost. 6085 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6086 C.first = ForceTargetInstructionCost; 6087 6088 BlockCost.first += C.first; 6089 BlockCost.second |= C.second; 6090 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6091 << " for VF " << VF << " For instruction: " << I 6092 << '\n'); 6093 } 6094 6095 // If we are vectorizing a predicated block, it will have been 6096 // if-converted. This means that the block's instructions (aside from 6097 // stores and instructions that may divide by zero) will now be 6098 // unconditionally executed. For the scalar case, we may not always execute 6099 // the predicated block. Thus, scale the block's cost by the probability of 6100 // executing it. 
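    // For illustration: with a reciprocal block probability of 2 (i.e. the
    // predicated block is assumed to execute on every other iteration), a
    // block whose instructions cost 8 contributes only 4 to the scalar loop
    // cost.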
6101 if (VF.isScalar() && blockNeedsPredication(BB)) 6102 BlockCost.first /= getReciprocalPredBlockProb(); 6103 6104 Cost.first += BlockCost.first; 6105 Cost.second |= BlockCost.second; 6106 } 6107 6108 return Cost; 6109 } 6110 6111 /// Gets Address Access SCEV after verifying that the access pattern 6112 /// is loop invariant except the induction variable dependence. 6113 /// 6114 /// This SCEV can be sent to the Target in order to estimate the address 6115 /// calculation cost. 6116 static const SCEV *getAddressAccessSCEV( 6117 Value *Ptr, 6118 LoopVectorizationLegality *Legal, 6119 PredicatedScalarEvolution &PSE, 6120 const Loop *TheLoop) { 6121 6122 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6123 if (!Gep) 6124 return nullptr; 6125 6126 // We are looking for a gep with all loop invariant indices except for one 6127 // which should be an induction variable. 6128 auto SE = PSE.getSE(); 6129 unsigned NumOperands = Gep->getNumOperands(); 6130 for (unsigned i = 1; i < NumOperands; ++i) { 6131 Value *Opd = Gep->getOperand(i); 6132 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6133 !Legal->isInductionVariable(Opd)) 6134 return nullptr; 6135 } 6136 6137 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6138 return PSE.getSCEV(Ptr); 6139 } 6140 6141 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6142 return Legal->hasStride(I->getOperand(0)) || 6143 Legal->hasStride(I->getOperand(1)); 6144 } 6145 6146 unsigned 6147 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6148 ElementCount VF) { 6149 assert(VF.isVector() && 6150 "Scalarization cost of instruction implies vectorization."); 6151 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6152 Type *ValTy = getMemInstValueType(I); 6153 auto SE = PSE.getSE(); 6154 6155 unsigned AS = getLoadStoreAddressSpace(I); 6156 Value *Ptr = getLoadStorePointerOperand(I); 6157 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6158 6159 // Figure out whether the access is strided and get the stride value 6160 // if it's known in compile time 6161 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6162 6163 // Get the cost of the scalar memory instruction and address computation. 6164 unsigned Cost = 6165 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6166 6167 // Don't pass *I here, since it is scalar but will actually be part of a 6168 // vectorized loop where the user of it is a vectorized instruction. 6169 const Align Alignment = getLoadStoreAlignment(I); 6170 Cost += VF.getKnownMinValue() * 6171 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6172 AS, TTI::TCK_RecipThroughput); 6173 6174 // Get the overhead of the extractelement and insertelement instructions 6175 // we might create due to scalarization. 6176 Cost += getScalarizationOverhead(I, VF); 6177 6178 // If we have a predicated store, it may not be executed for each vector 6179 // lane. Scale the cost by the probability of executing the predicated 6180 // block. 6181 if (isPredicatedInst(I)) { 6182 Cost /= getReciprocalPredBlockProb(); 6183 6184 if (useEmulatedMaskMemRefHack(I)) 6185 // Artificially setting to a high enough value to practically disable 6186 // vectorization with such operations. 
6187 Cost = 3000000; 6188 } 6189 6190 return Cost; 6191 } 6192 6193 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6194 ElementCount VF) { 6195 Type *ValTy = getMemInstValueType(I); 6196 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6197 Value *Ptr = getLoadStorePointerOperand(I); 6198 unsigned AS = getLoadStoreAddressSpace(I); 6199 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6200 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6201 6202 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6203 "Stride should be 1 or -1 for consecutive memory access"); 6204 const Align Alignment = getLoadStoreAlignment(I); 6205 unsigned Cost = 0; 6206 if (Legal->isMaskRequired(I)) 6207 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6208 CostKind); 6209 else 6210 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6211 CostKind, I); 6212 6213 bool Reverse = ConsecutiveStride < 0; 6214 if (Reverse) 6215 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6216 return Cost; 6217 } 6218 6219 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6220 ElementCount VF) { 6221 Type *ValTy = getMemInstValueType(I); 6222 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6223 const Align Alignment = getLoadStoreAlignment(I); 6224 unsigned AS = getLoadStoreAddressSpace(I); 6225 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6226 if (isa<LoadInst>(I)) { 6227 return TTI.getAddressComputationCost(ValTy) + 6228 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6229 CostKind) + 6230 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6231 } 6232 StoreInst *SI = cast<StoreInst>(I); 6233 6234 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6235 return TTI.getAddressComputationCost(ValTy) + 6236 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6237 CostKind) + 6238 (isLoopInvariantStoreValue 6239 ? 0 6240 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6241 VF.getKnownMinValue() - 1)); 6242 } 6243 6244 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6245 ElementCount VF) { 6246 Type *ValTy = getMemInstValueType(I); 6247 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6248 const Align Alignment = getLoadStoreAlignment(I); 6249 const Value *Ptr = getLoadStorePointerOperand(I); 6250 6251 return TTI.getAddressComputationCost(VectorTy) + 6252 TTI.getGatherScatterOpCost( 6253 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6254 TargetTransformInfo::TCK_RecipThroughput, I); 6255 } 6256 6257 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6258 ElementCount VF) { 6259 Type *ValTy = getMemInstValueType(I); 6260 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6261 unsigned AS = getLoadStoreAddressSpace(I); 6262 6263 auto Group = getInterleavedAccessGroup(I); 6264 assert(Group && "Fail to get an interleaved access group."); 6265 6266 unsigned InterleaveFactor = Group->getFactor(); 6267 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6268 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6269 6270 // Holds the indices of existing members in an interleaved load group. 6271 // An interleaved store group doesn't need this as it doesn't allow gaps. 
6272 SmallVector<unsigned, 4> Indices; 6273 if (isa<LoadInst>(I)) { 6274 for (unsigned i = 0; i < InterleaveFactor; i++) 6275 if (Group->getMember(i)) 6276 Indices.push_back(i); 6277 } 6278 6279 // Calculate the cost of the whole interleaved group. 6280 bool UseMaskForGaps = 6281 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6282 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6283 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6284 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6285 6286 if (Group->isReverse()) { 6287 // TODO: Add support for reversed masked interleaved access. 6288 assert(!Legal->isMaskRequired(I) && 6289 "Reverse masked interleaved access not supported."); 6290 Cost += Group->getNumMembers() * 6291 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6292 } 6293 return Cost; 6294 } 6295 6296 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6297 ElementCount VF) { 6298 // Calculate scalar cost only. Vectorization cost should be ready at this 6299 // moment. 6300 if (VF.isScalar()) { 6301 Type *ValTy = getMemInstValueType(I); 6302 const Align Alignment = getLoadStoreAlignment(I); 6303 unsigned AS = getLoadStoreAddressSpace(I); 6304 6305 return TTI.getAddressComputationCost(ValTy) + 6306 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6307 TTI::TCK_RecipThroughput, I); 6308 } 6309 return getWideningCost(I, VF); 6310 } 6311 6312 LoopVectorizationCostModel::VectorizationCostTy 6313 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6314 ElementCount VF) { 6315 assert(!VF.isScalable() && 6316 "the cost model is not yet implemented for scalable vectorization"); 6317 // If we know that this instruction will remain uniform, check the cost of 6318 // the scalar version. 6319 if (isUniformAfterVectorization(I, VF)) 6320 VF = ElementCount::getFixed(1); 6321 6322 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6323 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6324 6325 // Forced scalars do not have any scalarization overhead. 6326 auto ForcedScalar = ForcedScalars.find(VF); 6327 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6328 auto InstSet = ForcedScalar->second; 6329 if (InstSet.count(I)) 6330 return VectorizationCostTy( 6331 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6332 VF.getKnownMinValue()), 6333 false); 6334 } 6335 6336 Type *VectorTy; 6337 unsigned C = getInstructionCost(I, VF, VectorTy); 6338 6339 bool TypeNotScalarized = 6340 VF.isVector() && VectorTy->isVectorTy() && 6341 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6342 return VectorizationCostTy(C, TypeNotScalarized); 6343 } 6344 6345 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6346 ElementCount VF) { 6347 6348 assert(!VF.isScalable() && 6349 "cannot compute scalarization overhead for scalable vectorization"); 6350 if (VF.isScalar()) 6351 return 0; 6352 6353 unsigned Cost = 0; 6354 Type *RetTy = ToVectorTy(I->getType(), VF); 6355 if (!RetTy->isVoidTy() && 6356 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6357 Cost += TTI.getScalarizationOverhead( 6358 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 6359 true, false); 6360 6361 // Some targets keep addresses scalar. 6362 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6363 return Cost; 6364 6365 // Some targets support efficient element stores. 
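  // In that case no operand-extraction overhead is added below; a store's
  // result contributes nothing above since its type is void.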
6366 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6367 return Cost; 6368 6369 // Collect operands to consider. 6370 CallInst *CI = dyn_cast<CallInst>(I); 6371 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6372 6373 // Skip operands that do not require extraction/scalarization and do not incur 6374 // any overhead. 6375 return Cost + TTI.getOperandsScalarizationOverhead( 6376 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 6377 } 6378 6379 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6380 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6381 if (VF.isScalar()) 6382 return; 6383 NumPredStores = 0; 6384 for (BasicBlock *BB : TheLoop->blocks()) { 6385 // For each instruction in the old loop. 6386 for (Instruction &I : *BB) { 6387 Value *Ptr = getLoadStorePointerOperand(&I); 6388 if (!Ptr) 6389 continue; 6390 6391 // TODO: We should generate better code and update the cost model for 6392 // predicated uniform stores. Today they are treated as any other 6393 // predicated store (see added test cases in 6394 // invariant-store-vectorization.ll). 6395 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6396 NumPredStores++; 6397 6398 if (Legal->isUniform(Ptr) && 6399 // Conditional loads and stores should be scalarized and predicated. 6400 // isScalarWithPredication cannot be used here since masked 6401 // gather/scatters are not considered scalar with predication. 6402 !Legal->blockNeedsPredication(I.getParent())) { 6403 // TODO: Avoid replicating loads and stores instead of 6404 // relying on instcombine to remove them. 6405 // Load: Scalar load + broadcast 6406 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6407 unsigned Cost = getUniformMemOpCost(&I, VF); 6408 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6409 continue; 6410 } 6411 6412 // We assume that widening is the best solution when possible. 6413 if (memoryInstructionCanBeWidened(&I, VF)) { 6414 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6415 int ConsecutiveStride = 6416 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6417 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6418 "Expected consecutive stride."); 6419 InstWidening Decision = 6420 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6421 setWideningDecision(&I, VF, Decision, Cost); 6422 continue; 6423 } 6424 6425 // Choose between Interleaving, Gather/Scatter or Scalarization. 6426 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6427 unsigned NumAccesses = 1; 6428 if (isAccessInterleaved(&I)) { 6429 auto Group = getInterleavedAccessGroup(&I); 6430 assert(Group && "Fail to get an interleaved access group."); 6431 6432 // Make one decision for the whole group. 6433 if (getWideningDecision(&I, VF) != CM_Unknown) 6434 continue; 6435 6436 NumAccesses = Group->getNumMembers(); 6437 if (interleavedAccessCanBeWidened(&I, VF)) 6438 InterleaveCost = getInterleaveGroupCost(&I, VF); 6439 } 6440 6441 unsigned GatherScatterCost = 6442 isLegalGatherOrScatter(&I) 6443 ? getGatherScatterCost(&I, VF) * NumAccesses 6444 : std::numeric_limits<unsigned>::max(); 6445 6446 unsigned ScalarizationCost = 6447 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6448 6449 // Choose better solution for the current VF, 6450 // write down this decision and use it during vectorization. 
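      // For example (illustrative costs): with InterleaveCost = 6,
      // GatherScatterCost = 6 and ScalarizationCost = 10, interleaving wins
      // the tie against gather/scatter below because the comparison is <=,
      // reflecting a preference for interleaving when costs are equal.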
6451 unsigned Cost; 6452 InstWidening Decision; 6453 if (InterleaveCost <= GatherScatterCost && 6454 InterleaveCost < ScalarizationCost) { 6455 Decision = CM_Interleave; 6456 Cost = InterleaveCost; 6457 } else if (GatherScatterCost < ScalarizationCost) { 6458 Decision = CM_GatherScatter; 6459 Cost = GatherScatterCost; 6460 } else { 6461 Decision = CM_Scalarize; 6462 Cost = ScalarizationCost; 6463 } 6464 // If the instructions belongs to an interleave group, the whole group 6465 // receives the same decision. The whole group receives the cost, but 6466 // the cost will actually be assigned to one instruction. 6467 if (auto Group = getInterleavedAccessGroup(&I)) 6468 setWideningDecision(Group, VF, Decision, Cost); 6469 else 6470 setWideningDecision(&I, VF, Decision, Cost); 6471 } 6472 } 6473 6474 // Make sure that any load of address and any other address computation 6475 // remains scalar unless there is gather/scatter support. This avoids 6476 // inevitable extracts into address registers, and also has the benefit of 6477 // activating LSR more, since that pass can't optimize vectorized 6478 // addresses. 6479 if (TTI.prefersVectorizedAddressing()) 6480 return; 6481 6482 // Start with all scalar pointer uses. 6483 SmallPtrSet<Instruction *, 8> AddrDefs; 6484 for (BasicBlock *BB : TheLoop->blocks()) 6485 for (Instruction &I : *BB) { 6486 Instruction *PtrDef = 6487 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6488 if (PtrDef && TheLoop->contains(PtrDef) && 6489 getWideningDecision(&I, VF) != CM_GatherScatter) 6490 AddrDefs.insert(PtrDef); 6491 } 6492 6493 // Add all instructions used to generate the addresses. 6494 SmallVector<Instruction *, 4> Worklist; 6495 for (auto *I : AddrDefs) 6496 Worklist.push_back(I); 6497 while (!Worklist.empty()) { 6498 Instruction *I = Worklist.pop_back_val(); 6499 for (auto &Op : I->operands()) 6500 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6501 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6502 AddrDefs.insert(InstOp).second) 6503 Worklist.push_back(InstOp); 6504 } 6505 6506 for (auto *I : AddrDefs) { 6507 if (isa<LoadInst>(I)) { 6508 // Setting the desired widening decision should ideally be handled in 6509 // by cost functions, but since this involves the task of finding out 6510 // if the loaded register is involved in an address computation, it is 6511 // instead changed here when we know this is the case. 6512 InstWidening Decision = getWideningDecision(I, VF); 6513 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6514 // Scalarize a widened load of address. 6515 setWideningDecision( 6516 I, VF, CM_Scalarize, 6517 (VF.getKnownMinValue() * 6518 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6519 else if (auto Group = getInterleavedAccessGroup(I)) { 6520 // Scalarize an interleave group of address loads. 6521 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6522 if (Instruction *Member = Group->getMember(I)) 6523 setWideningDecision( 6524 Member, VF, CM_Scalarize, 6525 (VF.getKnownMinValue() * 6526 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6527 } 6528 } 6529 } else 6530 // Make sure I gets scalarized and a cost estimate without 6531 // scalarization overhead. 
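      // Non-load address computations (e.g. GEPs feeding only scalarized
      // memory accesses) are recorded here; getInstructionCost later prices
      // them as VF scalar copies with no extract/insert overhead.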
6532 ForcedScalars[VF].insert(I); 6533 } 6534 } 6535 6536 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6537 ElementCount VF, 6538 Type *&VectorTy) { 6539 Type *RetTy = I->getType(); 6540 if (canTruncateToMinimalBitwidth(I, VF)) 6541 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6542 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6543 auto SE = PSE.getSE(); 6544 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6545 6546 // TODO: We need to estimate the cost of intrinsic calls. 6547 switch (I->getOpcode()) { 6548 case Instruction::GetElementPtr: 6549 // We mark this instruction as zero-cost because the cost of GEPs in 6550 // vectorized code depends on whether the corresponding memory instruction 6551 // is scalarized or not. Therefore, we handle GEPs with the memory 6552 // instruction cost. 6553 return 0; 6554 case Instruction::Br: { 6555 // In cases of scalarized and predicated instructions, there will be VF 6556 // predicated blocks in the vectorized loop. Each branch around these 6557 // blocks requires also an extract of its vector compare i1 element. 6558 bool ScalarPredicatedBB = false; 6559 BranchInst *BI = cast<BranchInst>(I); 6560 if (VF.isVector() && BI->isConditional() && 6561 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6562 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6563 ScalarPredicatedBB = true; 6564 6565 if (ScalarPredicatedBB) { 6566 // Return cost for branches around scalarized and predicated blocks. 6567 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6568 auto *Vec_i1Ty = 6569 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6570 return (TTI.getScalarizationOverhead( 6571 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6572 false, true) + 6573 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 6574 VF.getKnownMinValue())); 6575 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6576 // The back-edge branch will remain, as will all scalar branches. 6577 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6578 else 6579 // This branch will be eliminated by if-conversion. 6580 return 0; 6581 // Note: We currently assume zero cost for an unconditional branch inside 6582 // a predicated block since it will become a fall-through, although we 6583 // may decide in the future to call TTI for all branches. 6584 } 6585 case Instruction::PHI: { 6586 auto *Phi = cast<PHINode>(I); 6587 6588 // First-order recurrences are replaced by vector shuffles inside the loop. 6589 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6590 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 6591 return TTI.getShuffleCost( 6592 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 6593 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 6594 6595 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6596 // converted into select instructions. We require N - 1 selects per phi 6597 // node, where N is the number of incoming values. 
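    // For example, a phi merging three incoming values is lowered to two
    // selects, so its cost below is 2 * the cost of a vector select at this
    // VF.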
6598 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 6599 return (Phi->getNumIncomingValues() - 1) * 6600 TTI.getCmpSelInstrCost( 6601 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6602 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6603 CostKind); 6604 6605 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6606 } 6607 case Instruction::UDiv: 6608 case Instruction::SDiv: 6609 case Instruction::URem: 6610 case Instruction::SRem: 6611 // If we have a predicated instruction, it may not be executed for each 6612 // vector lane. Get the scalarization cost and scale this amount by the 6613 // probability of executing the predicated block. If the instruction is not 6614 // predicated, we fall through to the next case. 6615 if (VF.isVector() && isScalarWithPredication(I)) { 6616 unsigned Cost = 0; 6617 6618 // These instructions have a non-void type, so account for the phi nodes 6619 // that we will create. This cost is likely to be zero. The phi node 6620 // cost, if any, should be scaled by the block probability because it 6621 // models a copy at the end of each predicated block. 6622 Cost += VF.getKnownMinValue() * 6623 TTI.getCFInstrCost(Instruction::PHI, CostKind); 6624 6625 // The cost of the non-predicated instruction. 6626 Cost += VF.getKnownMinValue() * 6627 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6628 6629 // The cost of insertelement and extractelement instructions needed for 6630 // scalarization. 6631 Cost += getScalarizationOverhead(I, VF); 6632 6633 // Scale the cost by the probability of executing the predicated blocks. 6634 // This assumes the predicated block for each vector lane is equally 6635 // likely. 6636 return Cost / getReciprocalPredBlockProb(); 6637 } 6638 LLVM_FALLTHROUGH; 6639 case Instruction::Add: 6640 case Instruction::FAdd: 6641 case Instruction::Sub: 6642 case Instruction::FSub: 6643 case Instruction::Mul: 6644 case Instruction::FMul: 6645 case Instruction::FDiv: 6646 case Instruction::FRem: 6647 case Instruction::Shl: 6648 case Instruction::LShr: 6649 case Instruction::AShr: 6650 case Instruction::And: 6651 case Instruction::Or: 6652 case Instruction::Xor: { 6653 // Since we will replace the stride by 1 the multiplication should go away. 6654 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6655 return 0; 6656 // Certain instructions can be cheaper to vectorize if they have a constant 6657 // second vector operand. One example of this are shifts on x86. 6658 Value *Op2 = I->getOperand(1); 6659 TargetTransformInfo::OperandValueProperties Op2VP; 6660 TargetTransformInfo::OperandValueKind Op2VK = 6661 TTI.getOperandInfo(Op2, Op2VP); 6662 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6663 Op2VK = TargetTransformInfo::OK_UniformValue; 6664 6665 SmallVector<const Value *, 4> Operands(I->operand_values()); 6666 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 6667 return N * TTI.getArithmeticInstrCost( 6668 I->getOpcode(), VectorTy, CostKind, 6669 TargetTransformInfo::OK_AnyValue, 6670 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6671 } 6672 case Instruction::FNeg: { 6673 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 6674 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 6675 return N * TTI.getArithmeticInstrCost( 6676 I->getOpcode(), VectorTy, CostKind, 6677 TargetTransformInfo::OK_AnyValue, 6678 TargetTransformInfo::OK_AnyValue, 6679 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6680 I->getOperand(0), I); 6681 } 6682 case Instruction::Select: { 6683 SelectInst *SI = cast<SelectInst>(I); 6684 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6685 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6686 Type *CondTy = SI->getCondition()->getType(); 6687 if (!ScalarCond) { 6688 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 6689 CondTy = VectorType::get(CondTy, VF); 6690 } 6691 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 6692 CostKind, I); 6693 } 6694 case Instruction::ICmp: 6695 case Instruction::FCmp: { 6696 Type *ValTy = I->getOperand(0)->getType(); 6697 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6698 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6699 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6700 VectorTy = ToVectorTy(ValTy, VF); 6701 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, 6702 I); 6703 } 6704 case Instruction::Store: 6705 case Instruction::Load: { 6706 ElementCount Width = VF; 6707 if (Width.isVector()) { 6708 InstWidening Decision = getWideningDecision(I, Width); 6709 assert(Decision != CM_Unknown && 6710 "CM decision should be taken at this point"); 6711 if (Decision == CM_Scalarize) 6712 Width = ElementCount::getFixed(1); 6713 } 6714 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6715 return getMemoryInstructionCost(I, VF); 6716 } 6717 case Instruction::ZExt: 6718 case Instruction::SExt: 6719 case Instruction::FPToUI: 6720 case Instruction::FPToSI: 6721 case Instruction::FPExt: 6722 case Instruction::PtrToInt: 6723 case Instruction::IntToPtr: 6724 case Instruction::SIToFP: 6725 case Instruction::UIToFP: 6726 case Instruction::Trunc: 6727 case Instruction::FPTrunc: 6728 case Instruction::BitCast: { 6729 // Computes the CastContextHint from a Load/Store instruction. 6730 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 6731 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6732 "Expected a load or a store!"); 6733 6734 if (VF.isScalar() || !TheLoop->contains(I)) 6735 return TTI::CastContextHint::Normal; 6736 6737 switch (getWideningDecision(I, VF)) { 6738 case LoopVectorizationCostModel::CM_GatherScatter: 6739 return TTI::CastContextHint::GatherScatter; 6740 case LoopVectorizationCostModel::CM_Interleave: 6741 return TTI::CastContextHint::Interleave; 6742 case LoopVectorizationCostModel::CM_Scalarize: 6743 case LoopVectorizationCostModel::CM_Widen: 6744 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 6745 : TTI::CastContextHint::Normal; 6746 case LoopVectorizationCostModel::CM_Widen_Reverse: 6747 return TTI::CastContextHint::Reversed; 6748 case LoopVectorizationCostModel::CM_Unknown: 6749 llvm_unreachable("Instr did not go through cost modelling?"); 6750 } 6751 6752 llvm_unreachable("Unhandled case!"); 6753 }; 6754 6755 unsigned Opcode = I->getOpcode(); 6756 TTI::CastContextHint CCH = TTI::CastContextHint::None; 6757 // For Trunc, the context is the only user, which must be a StoreInst. 
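    // For example, a trunc whose single user is a consecutive store assigned
    // CM_Widen gets CastContextHint::Normal (or ::Masked if the store
    // requires a mask); a CM_Widen_Reverse store yields ::Reversed.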
6758 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 6759 if (I->hasOneUse()) 6760 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 6761 CCH = ComputeCCH(Store); 6762 } 6763 // For Z/Sext, the context is the operand, which must be a LoadInst. 6764 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 6765 Opcode == Instruction::FPExt) { 6766 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 6767 CCH = ComputeCCH(Load); 6768 } 6769 6770 // We optimize the truncation of induction variables having constant 6771 // integer steps. The cost of these truncations is the same as the scalar 6772 // operation. 6773 if (isOptimizableIVTruncate(I, VF)) { 6774 auto *Trunc = cast<TruncInst>(I); 6775 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6776 Trunc->getSrcTy(), CCH, CostKind, Trunc); 6777 } 6778 6779 Type *SrcScalarTy = I->getOperand(0)->getType(); 6780 Type *SrcVecTy = 6781 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6782 if (canTruncateToMinimalBitwidth(I, VF)) { 6783 // This cast is going to be shrunk. This may remove the cast or it might 6784 // turn it into slightly different cast. For example, if MinBW == 16, 6785 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6786 // 6787 // Calculate the modified src and dest types. 6788 Type *MinVecTy = VectorTy; 6789 if (Opcode == Instruction::Trunc) { 6790 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6791 VectorTy = 6792 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6793 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 6794 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6795 VectorTy = 6796 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6797 } 6798 } 6799 6800 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 6801 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 6802 return N * 6803 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 6804 } 6805 case Instruction::Call: { 6806 bool NeedToScalarize; 6807 CallInst *CI = cast<CallInst>(I); 6808 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6809 if (getVectorIntrinsicIDForCall(CI, TLI)) 6810 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6811 return CallCost; 6812 } 6813 default: 6814 // The cost of executing VF copies of the scalar instruction. This opcode 6815 // is unknown. Assume that it is the same as 'mul'. 6816 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 6817 Instruction::Mul, VectorTy, CostKind) + 6818 getScalarizationOverhead(I, VF); 6819 } // end of switch. 
6820 } 6821 6822 char LoopVectorize::ID = 0; 6823 6824 static const char lv_name[] = "Loop Vectorization"; 6825 6826 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6827 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6828 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6829 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6830 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6831 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6832 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6833 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6834 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6835 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6836 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6837 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6838 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6839 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6840 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6841 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6842 6843 namespace llvm { 6844 6845 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6846 6847 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6848 bool VectorizeOnlyWhenForced) { 6849 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6850 } 6851 6852 } // end namespace llvm 6853 6854 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6855 // Check if the pointer operand of a load or store instruction is 6856 // consecutive. 6857 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6858 return Legal->isConsecutivePtr(Ptr); 6859 return false; 6860 } 6861 6862 void LoopVectorizationCostModel::collectValuesToIgnore() { 6863 // Ignore ephemeral values. 6864 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6865 6866 // Ignore type-promoting instructions we identified during reduction 6867 // detection. 6868 for (auto &Reduction : Legal->getReductionVars()) { 6869 RecurrenceDescriptor &RedDes = Reduction.second; 6870 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6871 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6872 } 6873 // Ignore type-casting instructions we identified during induction 6874 // detection. 6875 for (auto &Induction : Legal->getInductionVars()) { 6876 InductionDescriptor &IndDes = Induction.second; 6877 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6878 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6879 } 6880 } 6881 6882 void LoopVectorizationCostModel::collectInLoopReductions() { 6883 // For the moment, without predicated reduction instructions, we do not 6884 // support inloop reductions whilst folding the tail, and hence in those cases 6885 // all reductions are currently out of the loop. 6886 if (!PreferInLoopReductions || foldTailByMasking()) 6887 return; 6888 6889 for (auto &Reduction : Legal->getReductionVars()) { 6890 PHINode *Phi = Reduction.first; 6891 RecurrenceDescriptor &RdxDesc = Reduction.second; 6892 6893 // We don't collect reductions that are type promoted (yet). 6894 if (RdxDesc.getRecurrenceType() != Phi->getType()) 6895 continue; 6896 6897 // Check that we can correctly put the reductions into the loop, by 6898 // finding the chain of operations that leads from the phi to the loop 6899 // exit value. 
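// For example, a plain integer add reduction yields the chain of add instructions between the header phi and the value feeding it from the latch; if no such chain can be found, the reduction stays out of the loop.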
6900 SmallVector<Instruction *, 4> ReductionOperations = 6901 RdxDesc.getReductionOpChain(Phi, TheLoop); 6902 bool InLoop = !ReductionOperations.empty(); 6903 if (InLoop) 6904 InLoopReductionChains[Phi] = ReductionOperations; 6905 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 6906 << " reduction for phi: " << *Phi << "\n"); 6907 } 6908 } 6909 6910 // TODO: we could return a pair of values that specify the max VF and 6911 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6912 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment 6913 // doesn't have a cost model that can choose which plan to execute if 6914 // more than one is generated. 6915 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6916 LoopVectorizationCostModel &CM) { 6917 unsigned WidestType; 6918 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6919 return WidestVectorRegBits / WidestType; 6920 } 6921 6922 VectorizationFactor 6923 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 6924 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 6925 ElementCount VF = UserVF; 6926 // Outer loop handling: They may require CFG and instruction level 6927 // transformations before even evaluating whether vectorization is profitable. 6928 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6929 // the vectorization pipeline. 6930 if (!OrigLoop->empty()) { 6931 // If the user doesn't provide a vectorization factor, determine a 6932 // reasonable one. 6933 if (UserVF.isZero()) { 6934 VF = ElementCount::getFixed( 6935 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 6936 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6937 6938 // Make sure we have a VF > 1 for stress testing. 6939 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 6940 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6941 << "overriding computed VF.\n"); 6942 VF = ElementCount::getFixed(4); 6943 } 6944 } 6945 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6946 assert(isPowerOf2_32(VF.getKnownMinValue()) && 6947 "VF needs to be a power of two"); 6948 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 6949 << "VF " << VF << " to build VPlans.\n"); 6950 buildVPlans(VF.getKnownMinValue(), VF.getKnownMinValue()); 6951 6952 // For VPlan build stress testing, we bail out after VPlan construction. 6953 if (VPlanBuildStressTest) 6954 return VectorizationFactor::Disabled(); 6955 6956 return {VF, 0 /*Cost*/}; 6957 } 6958 6959 LLVM_DEBUG( 6960 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6961 "VPlan-native path.\n"); 6962 return VectorizationFactor::Disabled(); 6963 } 6964 6965 Optional<VectorizationFactor> 6966 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 6967 assert(!UserVF.isScalable() && "scalable vectorization not yet handled"); 6968 assert(OrigLoop->empty() && "Inner loop expected."); 6969 Optional<unsigned> MaybeMaxVF = 6970 CM.computeMaxVF(UserVF.getKnownMinValue(), UserIC); 6971 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved. 6972 return None; 6973 6974 // Invalidate interleave groups if all blocks of the loop will be predicated.
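// A predicated header means the tail is being folded by masking, so every member access of an interleave group would need a mask; without masked-interleaved support in the target the groups, and all decisions based on them, must be dropped.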
6975 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6976 !useMaskedInterleavedAccesses(*TTI)) { 6977 LLVM_DEBUG( 6978 dbgs() 6979 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6980 "which requires masked-interleaved support.\n"); 6981 if (CM.InterleaveInfo.invalidateGroups()) 6982 // Invalidating interleave groups also requires invalidating all decisions 6983 // based on them, which includes widening decisions and uniform and scalar 6984 // values. 6985 CM.invalidateCostModelingDecisions(); 6986 } 6987 6988 if (!UserVF.isZero()) { 6989 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6990 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 6991 "VF needs to be a power of two"); 6992 // Collect the instructions (and their associated costs) that will be more 6993 // profitable to scalarize. 6994 CM.selectUserVectorizationFactor(UserVF); 6995 CM.collectInLoopReductions(); 6996 buildVPlansWithVPRecipes(UserVF.getKnownMinValue(), 6997 UserVF.getKnownMinValue()); 6998 LLVM_DEBUG(printPlans(dbgs())); 6999 return {{UserVF, 0}}; 7000 } 7001 7002 unsigned MaxVF = MaybeMaxVF.getValue(); 7003 assert(MaxVF != 0 && "MaxVF is zero."); 7004 7005 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 7006 // Collect Uniform and Scalar instructions after vectorization with VF. 7007 CM.collectUniformsAndScalars(ElementCount::getFixed(VF)); 7008 7009 // Collect the instructions (and their associated costs) that will be more 7010 // profitable to scalarize. 7011 if (VF > 1) 7012 CM.collectInstsToScalarize(ElementCount::getFixed(VF)); 7013 } 7014 7015 CM.collectInLoopReductions(); 7016 7017 buildVPlansWithVPRecipes(1, MaxVF); 7018 LLVM_DEBUG(printPlans(dbgs())); 7019 if (MaxVF == 1) 7020 return VectorizationFactor::Disabled(); 7021 7022 // Select the optimal vectorization factor. 7023 return CM.selectVectorizationFactor(MaxVF); 7024 } 7025 7026 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7027 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7028 << '\n'); 7029 BestVF = VF; 7030 BestUF = UF; 7031 7032 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7033 return !Plan->hasVF(VF); 7034 }); 7035 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7036 } 7037 7038 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7039 DominatorTree *DT) { 7040 // Perform the actual loop transformation. 7041 7042 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7043 VPCallbackILV CallbackILV(ILV); 7044 7045 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7046 7047 VPTransformState State{*BestVF, BestUF, LI, 7048 DT, ILV.Builder, ILV.VectorLoopValueMap, 7049 &ILV, CallbackILV}; 7050 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7051 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7052 State.CanonicalIV = ILV.Induction; 7053 7054 //===------------------------------------------------===// 7055 // 7056 // Notice: any optimization or new instruction that go 7057 // into the code below should also be implemented in 7058 // the cost-model. 7059 // 7060 //===------------------------------------------------===// 7061 7062 // 2. Copy and widen instructions from the old loop into the new loop. 7063 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7064 VPlans.front()->execute(&State); 7065 7066 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7067 // predication, updating analyses. 
7068 ILV.fixVectorizedLoop(); 7069 } 7070 7071 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7072 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7073 BasicBlock *Latch = OrigLoop->getLoopLatch(); 7074 7075 // We create new control-flow for the vectorized loop, so the original 7076 // condition will be dead after vectorization if it's only used by the 7077 // branch. 7078 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 7079 if (Cmp && Cmp->hasOneUse()) 7080 DeadInstructions.insert(Cmp); 7081 7082 // We create new "steps" for induction variable updates to which the original 7083 // induction variables map. An original update instruction will be dead if 7084 // all its users except the induction variable are dead. 7085 for (auto &Induction : Legal->getInductionVars()) { 7086 PHINode *Ind = Induction.first; 7087 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7088 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7089 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7090 })) 7091 DeadInstructions.insert(IndUpdate); 7092 7093 // We record as "Dead" also the type-casting instructions we had identified 7094 // during induction analysis. We don't need any handling for them in the 7095 // vectorized loop because we have proven that, under a proper runtime 7096 // test guarding the vectorized loop, the value of the phi, and the casted 7097 // value of the phi, are the same. The last instruction in this casting chain 7098 // will get its scalar/vector/widened def from the scalar/vector/widened def 7099 // of the respective phi node. Any other casts in the induction def-use chain 7100 // have no other uses outside the phi update chain, and will be ignored. 7101 InductionDescriptor &IndDes = Induction.second; 7102 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7103 DeadInstructions.insert(Casts.begin(), Casts.end()); 7104 } 7105 } 7106 7107 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7108 7109 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7110 7111 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7112 Instruction::BinaryOps BinOp) { 7113 // When unrolling and the VF is 1, we only need to add a simple scalar. 7114 Type *Ty = Val->getType(); 7115 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7116 7117 if (Ty->isFloatingPointTy()) { 7118 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7119 7120 // Floating point operations had to be 'fast' to enable the unrolling. 7121 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7122 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7123 } 7124 Constant *C = ConstantInt::get(Ty, StartIdx); 7125 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7126 } 7127 7128 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7129 SmallVector<Metadata *, 4> MDs; 7130 // Reserve first location for self reference to the LoopID metadata node. 7131 MDs.push_back(nullptr); 7132 bool IsUnrollMetadata = false; 7133 MDNode *LoopID = L->getLoopID(); 7134 if (LoopID) { 7135 // First find existing loop unrolling disable metadata. 
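// Loop metadata is a self-referential MDNode: operand 0 refers back to the node itself (reserved above and patched with replaceOperandWith(0, ...) below), while the remaining operands carry the individual loop hints and are copied over unchanged.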
7136 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7137 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7138 if (MD) { 7139 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7140 IsUnrollMetadata = 7141 S && S->getString().startswith("llvm.loop.unroll.disable"); 7142 } 7143 MDs.push_back(LoopID->getOperand(i)); 7144 } 7145 } 7146 7147 if (!IsUnrollMetadata) { 7148 // Add runtime unroll disable metadata. 7149 LLVMContext &Context = L->getHeader()->getContext(); 7150 SmallVector<Metadata *, 1> DisableOperands; 7151 DisableOperands.push_back( 7152 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7153 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7154 MDs.push_back(DisableNode); 7155 MDNode *NewLoopID = MDNode::get(Context, MDs); 7156 // Set operand 0 to refer to the loop id itself. 7157 NewLoopID->replaceOperandWith(0, NewLoopID); 7158 L->setLoopID(NewLoopID); 7159 } 7160 } 7161 7162 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7163 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7164 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 7165 bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start)); 7166 7167 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 7168 if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) { 7169 Range.End = TmpVF; 7170 break; 7171 } 7172 7173 return PredicateAtRangeStart; 7174 } 7175 7176 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7177 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7178 /// of VF's starting at a given VF and extending it as much as possible. Each 7179 /// vectorization decision can potentially shorten this sub-range during 7180 /// buildVPlan(). 7181 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 7182 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7183 VFRange SubRange = {VF, MaxVF + 1}; 7184 VPlans.push_back(buildVPlan(SubRange)); 7185 VF = SubRange.End; 7186 } 7187 } 7188 7189 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7190 VPlanPtr &Plan) { 7191 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7192 7193 // Look for cached value. 7194 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7195 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7196 if (ECEntryIt != EdgeMaskCache.end()) 7197 return ECEntryIt->second; 7198 7199 VPValue *SrcMask = createBlockInMask(Src, Plan); 7200 7201 // The terminator has to be a branch inst! 7202 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7203 assert(BI && "Unexpected terminator found"); 7204 7205 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7206 return EdgeMaskCache[Edge] = SrcMask; 7207 7208 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 7209 assert(EdgeMask && "No Edge Mask found for condition"); 7210 7211 if (BI->getSuccessor(0) != Dst) 7212 EdgeMask = Builder.createNot(EdgeMask); 7213 7214 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 7215 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 7216 7217 return EdgeMaskCache[Edge] = EdgeMask; 7218 } 7219 7220 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7221 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7222 7223 // Look for cached value. 
7224 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7225 if (BCEntryIt != BlockMaskCache.end()) 7226 return BCEntryIt->second; 7227 7228 // All-one mask is modelled as no-mask following the convention for masked 7229 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7230 VPValue *BlockMask = nullptr; 7231 7232 if (OrigLoop->getHeader() == BB) { 7233 if (!CM.blockNeedsPredication(BB)) 7234 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 7235 7236 // Introduce the early-exit compare IV <= BTC to form header block mask. 7237 // This is used instead of IV < TC because TC may wrap, unlike BTC. 7238 // Start by constructing the desired canonical IV. 7239 VPValue *IV = nullptr; 7240 if (Legal->getPrimaryInduction()) 7241 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 7242 else { 7243 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 7244 Builder.getInsertBlock()->appendRecipe(IVRecipe); 7245 IV = IVRecipe->getVPValue(); 7246 } 7247 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 7248 bool TailFolded = !CM.isScalarEpilogueAllowed(); 7249 7250 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 7251 // While ActiveLaneMask is a binary op that consumes the loop tripcount 7252 // as a second argument, we only pass the IV here and extract the 7253 // tripcount from the transform state where codegen of the VP instructions 7254 // happen. 7255 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 7256 } else { 7257 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 7258 } 7259 return BlockMaskCache[BB] = BlockMask; 7260 } 7261 7262 // This is the block mask. We OR all incoming edges. 7263 for (auto *Predecessor : predecessors(BB)) { 7264 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 7265 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 7266 return BlockMaskCache[BB] = EdgeMask; 7267 7268 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
7269 BlockMask = EdgeMask; 7270 continue; 7271 } 7272 7273 BlockMask = Builder.createOr(BlockMask, EdgeMask); 7274 } 7275 7276 return BlockMaskCache[BB] = BlockMask; 7277 } 7278 7279 VPWidenMemoryInstructionRecipe * 7280 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 7281 VPlanPtr &Plan) { 7282 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7283 "Must be called with either a load or store"); 7284 7285 auto willWiden = [&](ElementCount VF) -> bool { 7286 assert(!VF.isScalable() && "unexpected scalable ElementCount"); 7287 if (VF.isScalar()) 7288 return false; 7289 LoopVectorizationCostModel::InstWidening Decision = 7290 CM.getWideningDecision(I, VF); 7291 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 7292 "CM decision should be taken at this point."); 7293 if (Decision == LoopVectorizationCostModel::CM_Interleave) 7294 return true; 7295 if (CM.isScalarAfterVectorization(I, VF) || 7296 CM.isProfitableToScalarize(I, VF)) 7297 return false; 7298 return Decision != LoopVectorizationCostModel::CM_Scalarize; 7299 }; 7300 7301 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7302 return nullptr; 7303 7304 VPValue *Mask = nullptr; 7305 if (Legal->isMaskRequired(I)) 7306 Mask = createBlockInMask(I->getParent(), Plan); 7307 7308 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 7309 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 7310 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 7311 7312 StoreInst *Store = cast<StoreInst>(I); 7313 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 7314 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 7315 } 7316 7317 VPWidenIntOrFpInductionRecipe * 7318 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 7319 // Check if this is an integer or fp induction. If so, build the recipe that 7320 // produces its scalar and vector values. 7321 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 7322 if (II.getKind() == InductionDescriptor::IK_IntInduction || 7323 II.getKind() == InductionDescriptor::IK_FpInduction) 7324 return new VPWidenIntOrFpInductionRecipe(Phi); 7325 7326 return nullptr; 7327 } 7328 7329 VPWidenIntOrFpInductionRecipe * 7330 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 7331 VFRange &Range) const { 7332 // Optimize the special case where the source is a constant integer 7333 // induction variable. Notice that we can only optimize the 'trunc' case 7334 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 7335 // (c) other casts depend on pointer size. 7336 7337 // Determine whether \p K is a truncation based on an induction variable that 7338 // can be optimized. 7339 auto isOptimizableIVTruncate = 7340 [&](Instruction *K) -> std::function<bool(ElementCount)> { 7341 return [=](ElementCount VF) -> bool { 7342 return CM.isOptimizableIVTruncate(K, VF); 7343 }; 7344 }; 7345 7346 if (LoopVectorizationPlanner::getDecisionAndClampRange( 7347 isOptimizableIVTruncate(I), Range)) 7348 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 7349 I); 7350 return nullptr; 7351 } 7352 7353 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 7354 // We know that all PHIs in non-header blocks are converted into selects, so 7355 // we don't have to worry about the insertion order and we can just use the 7356 // builder. At this point we generate the predication tree. 
There may be 7357 // duplications since this is a simple recursive scan, but future 7358 // optimizations will clean it up. 7359 7360 SmallVector<VPValue *, 2> Operands; 7361 unsigned NumIncoming = Phi->getNumIncomingValues(); 7362 for (unsigned In = 0; In < NumIncoming; In++) { 7363 VPValue *EdgeMask = 7364 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 7365 assert((EdgeMask || NumIncoming == 1) && 7366 "Multiple predecessors with one having a full mask"); 7367 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 7368 if (EdgeMask) 7369 Operands.push_back(EdgeMask); 7370 } 7371 return new VPBlendRecipe(Phi, Operands); 7372 } 7373 7374 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 7375 VPlan &Plan) const { 7376 7377 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7378 [this, CI](ElementCount VF) { 7379 return CM.isScalarWithPredication(CI, VF); 7380 }, 7381 Range); 7382 7383 if (IsPredicated) 7384 return nullptr; 7385 7386 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7387 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 7388 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 7389 return nullptr; 7390 7391 auto willWiden = [&](ElementCount VF) -> bool { 7392 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7393 // The following case may be scalarized depending on the VF. 7394 // The flag shows whether we use Intrinsic or a usual Call for vectorized 7395 // version of the instruction. 7396 // Is it beneficial to perform intrinsic call compared to lib call? 7397 bool NeedToScalarize = false; 7398 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 7399 bool UseVectorIntrinsic = 7400 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 7401 return UseVectorIntrinsic || !NeedToScalarize; 7402 }; 7403 7404 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7405 return nullptr; 7406 7407 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 7408 } 7409 7410 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 7411 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 7412 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 7413 // Instruction should be widened, unless it is scalar after vectorization, 7414 // scalarization is profitable or it is predicated. 
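// Note that getDecisionAndClampRange may shrink Range.End so that the decision returned here holds uniformly for every VF remaining in the range.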
7415 auto WillScalarize = [this, I](ElementCount VF) -> bool { 7416 return CM.isScalarAfterVectorization(I, VF) || 7417 CM.isProfitableToScalarize(I, VF) || 7418 CM.isScalarWithPredication(I, VF); 7419 }; 7420 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 7421 Range); 7422 } 7423 7424 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 7425 auto IsVectorizableOpcode = [](unsigned Opcode) { 7426 switch (Opcode) { 7427 case Instruction::Add: 7428 case Instruction::And: 7429 case Instruction::AShr: 7430 case Instruction::BitCast: 7431 case Instruction::FAdd: 7432 case Instruction::FCmp: 7433 case Instruction::FDiv: 7434 case Instruction::FMul: 7435 case Instruction::FNeg: 7436 case Instruction::FPExt: 7437 case Instruction::FPToSI: 7438 case Instruction::FPToUI: 7439 case Instruction::FPTrunc: 7440 case Instruction::FRem: 7441 case Instruction::FSub: 7442 case Instruction::ICmp: 7443 case Instruction::IntToPtr: 7444 case Instruction::LShr: 7445 case Instruction::Mul: 7446 case Instruction::Or: 7447 case Instruction::PtrToInt: 7448 case Instruction::SDiv: 7449 case Instruction::Select: 7450 case Instruction::SExt: 7451 case Instruction::Shl: 7452 case Instruction::SIToFP: 7453 case Instruction::SRem: 7454 case Instruction::Sub: 7455 case Instruction::Trunc: 7456 case Instruction::UDiv: 7457 case Instruction::UIToFP: 7458 case Instruction::URem: 7459 case Instruction::Xor: 7460 case Instruction::ZExt: 7461 return true; 7462 } 7463 return false; 7464 }; 7465 7466 if (!IsVectorizableOpcode(I->getOpcode())) 7467 return nullptr; 7468 7469 // Success: widen this instruction. 7470 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 7471 } 7472 7473 VPBasicBlock *VPRecipeBuilder::handleReplication( 7474 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7475 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7476 VPlanPtr &Plan) { 7477 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7478 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 7479 Range); 7480 7481 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7482 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 7483 Range); 7484 7485 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 7486 IsUniform, IsPredicated); 7487 setRecipe(I, Recipe); 7488 7489 // Find if I uses a predicated instruction. If so, it will use its scalar 7490 // value. Avoid hoisting the insert-element which packs the scalar value into 7491 // a vector value, as that happens iff all users use the vector value. 7492 for (auto &Op : I->operands()) 7493 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7494 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7495 PredInst2Recipe[PredInst]->setAlsoPack(false); 7496 7497 // Finalize the recipe for Instr, first if it is not predicated. 7498 if (!IsPredicated) { 7499 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7500 VPBB->appendRecipe(Recipe); 7501 return VPBB; 7502 } 7503 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7504 assert(VPBB->getSuccessors().empty() && 7505 "VPBB has successors when handling predicated replication."); 7506 // Record predicated instructions for above packing optimizations. 
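// The replicate recipe is placed inside a triangular pred.* region (entry -> if -> continue) built below, and an empty successor VPBasicBlock is returned so that recipes for instructions following I in the original block are emitted after the region.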
7507 PredInst2Recipe[I] = Recipe; 7508 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7509 VPBlockUtils::insertBlockAfter(Region, VPBB); 7510 auto *RegSucc = new VPBasicBlock(); 7511 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7512 return RegSucc; 7513 } 7514 7515 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7516 VPRecipeBase *PredRecipe, 7517 VPlanPtr &Plan) { 7518 // Instructions marked for predication are replicated and placed under an 7519 // if-then construct to prevent side-effects. 7520 7521 // Generate recipes to compute the block mask for this region. 7522 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7523 7524 // Build the triangular if-then region. 7525 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7526 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7527 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7528 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7529 auto *PHIRecipe = 7530 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7531 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7532 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7533 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7534 7535 // Note: first set Entry as region entry and then connect successors starting 7536 // from it in order, to propagate the "parent" of each VPBasicBlock. 7537 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7538 VPBlockUtils::connectBlocks(Pred, Exit); 7539 7540 return Region; 7541 } 7542 7543 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 7544 VFRange &Range, 7545 VPlanPtr &Plan) { 7546 // First, check for specific widening recipes that deal with calls, memory 7547 // operations, inductions and Phi nodes. 7548 if (auto *CI = dyn_cast<CallInst>(Instr)) 7549 return tryToWidenCall(CI, Range, *Plan); 7550 7551 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 7552 return tryToWidenMemory(Instr, Range, Plan); 7553 7554 VPRecipeBase *Recipe; 7555 if (auto Phi = dyn_cast<PHINode>(Instr)) { 7556 if (Phi->getParent() != OrigLoop->getHeader()) 7557 return tryToBlend(Phi, Plan); 7558 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 7559 return Recipe; 7560 return new VPWidenPHIRecipe(Phi); 7561 } 7562 7563 if (isa<TruncInst>(Instr) && 7564 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 7565 return Recipe; 7566 7567 if (!shouldWiden(Instr, Range)) 7568 return nullptr; 7569 7570 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 7571 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 7572 OrigLoop); 7573 7574 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 7575 bool InvariantCond = 7576 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 7577 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 7578 InvariantCond); 7579 } 7580 7581 return tryToWiden(Instr, *Plan); 7582 } 7583 7584 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7585 unsigned MaxVF) { 7586 assert(OrigLoop->empty() && "Inner loop expected."); 7587 7588 // Collect conditions feeding internal conditional branches; they need to be 7589 // represented in VPlan for it to model masking. 
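// The latch is skipped below: its condition controls the back-edge rather than masking of the body, and the vectorized loop gets a fresh latch condition anyway.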
7590 SmallPtrSet<Value *, 1> NeedDef; 7591 7592 auto *Latch = OrigLoop->getLoopLatch(); 7593 for (BasicBlock *BB : OrigLoop->blocks()) { 7594 if (BB == Latch) 7595 continue; 7596 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 7597 if (Branch && Branch->isConditional()) 7598 NeedDef.insert(Branch->getCondition()); 7599 } 7600 7601 // If the tail is to be folded by masking, the primary induction variable, if 7602 // it exists, needs to be represented in VPlan for it to model early-exit masking. 7603 // Also, both the Phi and the live-out instruction of each reduction are 7604 // required in order to introduce a select between them in VPlan. 7605 if (CM.foldTailByMasking()) { 7606 if (Legal->getPrimaryInduction()) 7607 NeedDef.insert(Legal->getPrimaryInduction()); 7608 for (auto &Reduction : Legal->getReductionVars()) { 7609 NeedDef.insert(Reduction.first); 7610 NeedDef.insert(Reduction.second.getLoopExitInstr()); 7611 } 7612 } 7613 7614 // Collect instructions from the original loop that will become trivially dead 7615 // in the vectorized loop. We don't need to vectorize these instructions. For 7616 // example, original induction update instructions can become dead because we 7617 // separately emit induction "steps" when generating code for the new loop. 7618 // Similarly, we create a new latch condition when setting up the structure 7619 // of the new loop, so the old one can become dead. 7620 SmallPtrSet<Instruction *, 4> DeadInstructions; 7621 collectTriviallyDeadInstructions(DeadInstructions); 7622 7623 // Add assume instructions we need to drop to DeadInstructions, to prevent 7624 // them from being added to the VPlan. 7625 // TODO: We only need to drop assumes in blocks that get flattened. If the 7626 // control flow is preserved, we should keep them. 7627 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 7628 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 7629 7630 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 7631 // Dead instructions do not need sinking. Remove them from SinkAfter. 7632 for (Instruction *I : DeadInstructions) 7633 SinkAfter.erase(I); 7634 7635 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7636 VFRange SubRange = {VF, MaxVF + 1}; 7637 VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef, 7638 DeadInstructions, SinkAfter)); 7639 VF = SubRange.End; 7640 } 7641 } 7642 7643 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 7644 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 7645 SmallPtrSetImpl<Instruction *> &DeadInstructions, 7646 const DenseMap<Instruction *, Instruction *> &SinkAfter) { 7647 7648 // Hold a mapping from predicated instructions to their recipes, in order to 7649 // fix their AlsoPack behavior if a user is determined to replicate and use a 7650 // scalar instead of a vector value. 7651 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 7652 7653 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 7654 7655 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 7656 7657 // --------------------------------------------------------------------------- 7658 // Pre-construction: record ingredients whose recipes we'll need to further 7659 // process after constructing the initial VPlan. 7660 // --------------------------------------------------------------------------- 7661 7662 // Mark instructions we'll need to sink later and their targets as 7663 // ingredients whose recipe we'll need to record.
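// recordRecipeOf only registers the ingredient; its recipe is created while scanning the loop body below and can then be retrieved with getRecipe() during the post-construction transforms.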
7664 for (auto &Entry : SinkAfter) { 7665 RecipeBuilder.recordRecipeOf(Entry.first); 7666 RecipeBuilder.recordRecipeOf(Entry.second); 7667 } 7668 for (auto &Reduction : CM.getInLoopReductionChains()) { 7669 PHINode *Phi = Reduction.first; 7670 RecurrenceDescriptor::RecurrenceKind Kind = 7671 Legal->getReductionVars()[Phi].getRecurrenceKind(); 7672 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 7673 7674 RecipeBuilder.recordRecipeOf(Phi); 7675 for (auto &R : ReductionOperations) { 7676 RecipeBuilder.recordRecipeOf(R); 7677 // For min/max reductions, where we have a pair of icmp/select, we also 7678 // need to record the ICmp recipe, so it can be removed later. 7679 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7680 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7681 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 7682 } 7683 } 7684 } 7685 7686 // For each interleave group which is relevant for this (possibly trimmed) 7687 // Range, add it to the set of groups to be later applied to the VPlan and add 7688 // placeholders for its members' Recipes which we'll be replacing with a 7689 // single VPInterleaveRecipe. 7690 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 7691 auto applyIG = [IG, this](ElementCount VF) -> bool { 7692 return (VF.isVector() && // Query is illegal for VF == 1 7693 CM.getWideningDecision(IG->getInsertPos(), VF) == 7694 LoopVectorizationCostModel::CM_Interleave); 7695 }; 7696 if (!getDecisionAndClampRange(applyIG, Range)) 7697 continue; 7698 InterleaveGroups.insert(IG); 7699 for (unsigned i = 0; i < IG->getFactor(); i++) 7700 if (Instruction *Member = IG->getMember(i)) 7701 RecipeBuilder.recordRecipeOf(Member); 7702 }; 7703 7704 // --------------------------------------------------------------------------- 7705 // Build initial VPlan: Scan the body of the loop in a topological order to 7706 // visit each basic block after having visited its predecessor basic blocks. 7707 // --------------------------------------------------------------------------- 7708 7709 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 7710 auto Plan = std::make_unique<VPlan>(); 7711 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 7712 Plan->setEntry(VPBB); 7713 7714 // Represent values that will have defs inside VPlan. 7715 for (Value *V : NeedDef) 7716 Plan->addVPValue(V); 7717 7718 // Scan the body of the loop in a topological order to visit each basic block 7719 // after having visited its predecessor basic blocks. 7720 LoopBlocksDFS DFS(OrigLoop); 7721 DFS.perform(LI); 7722 7723 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 7724 // Relevant instructions from basic block BB will be grouped into VPRecipe 7725 // ingredients and fill a new VPBasicBlock. 7726 unsigned VPBBsForBB = 0; 7727 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 7728 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 7729 VPBB = FirstVPBBForBB; 7730 Builder.setInsertPoint(VPBB); 7731 7732 // Introduce each ingredient into VPlan. 7733 // TODO: Model and preserve debug intrinsics in VPlan. 7734 for (Instruction &I : BB->instructionsWithoutDebug()) { 7735 Instruction *Instr = &I; 7736 7737 // First filter out irrelevant instructions, to ensure no recipes are 7738 // built for them.
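// Branches need no recipes because control flow is represented by the VPlan CFG and the masks computed above; trivially dead instructions were collected earlier and are simply dropped.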
7739 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 7740 continue; 7741 7742 if (auto Recipe = 7743 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 7744 RecipeBuilder.setRecipe(Instr, Recipe); 7745 VPBB->appendRecipe(Recipe); 7746 continue; 7747 } 7748 7749 // Otherwise, if all widening options failed, Instruction is to be 7750 // replicated. This may create a successor for VPBB. 7751 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7752 Instr, Range, VPBB, PredInst2Recipe, Plan); 7753 if (NextVPBB != VPBB) { 7754 VPBB = NextVPBB; 7755 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7756 : ""); 7757 } 7758 } 7759 } 7760 7761 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7762 // may also be empty, such as the last one VPBB, reflecting original 7763 // basic-blocks with no recipes. 7764 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7765 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7766 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7767 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7768 delete PreEntry; 7769 7770 // --------------------------------------------------------------------------- 7771 // Transform initial VPlan: Apply previously taken decisions, in order, to 7772 // bring the VPlan to its final state. 7773 // --------------------------------------------------------------------------- 7774 7775 // Apply Sink-After legal constraints. 7776 for (auto &Entry : SinkAfter) { 7777 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7778 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7779 Sink->moveAfter(Target); 7780 } 7781 7782 // Interleave memory: for each Interleave Group we marked earlier as relevant 7783 // for this VPlan, replace the Recipes widening its memory instructions with a 7784 // single VPInterleaveRecipe at its insertion point. 7785 for (auto IG : InterleaveGroups) { 7786 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7787 RecipeBuilder.getRecipe(IG->getInsertPos())); 7788 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7789 ->insertBefore(Recipe); 7790 7791 for (unsigned i = 0; i < IG->getFactor(); ++i) 7792 if (Instruction *Member = IG->getMember(i)) { 7793 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7794 } 7795 } 7796 7797 // Adjust the recipes for any inloop reductions. 7798 if (Range.Start > 1) 7799 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 7800 7801 // Finally, if tail is folded by masking, introduce selects between the phi 7802 // and the live-out instruction of each reduction, at the end of the latch. 
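// Conceptually each reduction live-out becomes select(header-mask, live-out, phi), so lanes that belong to the folded tail keep the value from the previous iteration and do not affect the final reduction result.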
7803 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 7804 Builder.setInsertPoint(VPBB); 7805 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7806 for (auto &Reduction : Legal->getReductionVars()) { 7807 assert(!CM.isInLoopReduction(Reduction.first) && 7808 "Didn't expect inloop tail folded reduction yet!"); 7809 VPValue *Phi = Plan->getVPValue(Reduction.first); 7810 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7811 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7812 } 7813 } 7814 7815 std::string PlanName; 7816 raw_string_ostream RSO(PlanName); 7817 ElementCount VF = ElementCount::getFixed(Range.Start); 7818 Plan->addVF(VF); 7819 RSO << "Initial VPlan for VF={" << VF; 7820 for (VF *= 2; VF.getKnownMinValue() < Range.End; VF *= 2) { 7821 Plan->addVF(VF); 7822 RSO << "," << VF; 7823 } 7824 RSO << "},UF>=1"; 7825 RSO.flush(); 7826 Plan->setName(PlanName); 7827 7828 return Plan; 7829 } 7830 7831 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7832 // Outer loop handling: They may require CFG and instruction level 7833 // transformations before even evaluating whether vectorization is profitable. 7834 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7835 // the vectorization pipeline. 7836 assert(!OrigLoop->empty()); 7837 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7838 7839 // Create new empty VPlan 7840 auto Plan = std::make_unique<VPlan>(); 7841 7842 // Build hierarchical CFG 7843 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7844 HCFGBuilder.buildHierarchicalCFG(); 7845 7846 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7847 Plan->addVF(ElementCount::getFixed(VF)); 7848 7849 if (EnableVPlanPredication) { 7850 VPlanPredicator VPP(*Plan); 7851 VPP.predicate(); 7852 7853 // Avoid running transformation to recipes until masked code generation in 7854 // VPlan-native path is in place. 7855 return Plan; 7856 } 7857 7858 SmallPtrSet<Instruction *, 1> DeadInstructions; 7859 VPlanTransforms::VPInstructionsToVPRecipes( 7860 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7861 return Plan; 7862 } 7863 7864 // Adjust the recipes for any inloop reductions. The chain of instructions 7865 // leading from the loop exit instr to the phi needs to be converted to 7866 // reductions, with one operand being vector and the other being the scalar 7867 // reduction chain. 7868 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 7869 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { 7870 for (auto &Reduction : CM.getInLoopReductionChains()) { 7871 PHINode *Phi = Reduction.first; 7872 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 7873 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 7874 7875 // ReductionOperations are ordered top-down from the phi's use to the 7876 // LoopExitValue. We keep track of the previous item (the Chain) to tell 7877 // which of the two operands will remain scalar and which will be reduced. 7878 // For minmax the chain will be the select instructions.
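// For example, for an in-loop integer add reduction each widened add recipe is replaced below by a VPReductionRecipe taking the running scalar chain value and the vector operand; for min/max the paired compare recipe is deleted as well.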
7879 Instruction *Chain = Phi; 7880 for (Instruction *R : ReductionOperations) { 7881 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 7882 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind(); 7883 7884 VPValue *ChainOp = Plan->getVPValue(Chain); 7885 unsigned FirstOpId; 7886 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7887 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7888 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC && 7889 "Expected to replace a VPWidenSelectSC"); 7890 FirstOpId = 1; 7891 } else { 7892 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC && 7893 "Expected to replace a VPWidenSC"); 7894 FirstOpId = 0; 7895 } 7896 unsigned VecOpId = 7897 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 7898 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 7899 7900 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 7901 &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI); 7902 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 7903 WidenRecipe->eraseFromParent(); 7904 7905 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7906 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7907 VPRecipeBase *CompareRecipe = 7908 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 7909 assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC && 7910 "Expected to replace a VPWidenSC"); 7911 CompareRecipe->eraseFromParent(); 7912 } 7913 Chain = R; 7914 } 7915 } 7916 } 7917 7918 Value* LoopVectorizationPlanner::VPCallbackILV:: 7919 getOrCreateVectorValues(Value *V, unsigned Part) { 7920 return ILV.getOrCreateVectorValue(V, Part); 7921 } 7922 7923 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7924 Value *V, const VPIteration &Instance) { 7925 return ILV.getOrCreateScalarValue(V, Instance); 7926 } 7927 7928 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7929 VPSlotTracker &SlotTracker) const { 7930 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7931 IG->getInsertPos()->printAsOperand(O, false); 7932 O << ", "; 7933 getAddr()->printAsOperand(O, SlotTracker); 7934 VPValue *Mask = getMask(); 7935 if (Mask) { 7936 O << ", "; 7937 Mask->printAsOperand(O, SlotTracker); 7938 } 7939 for (unsigned i = 0; i < IG->getFactor(); ++i) 7940 if (Instruction *I = IG->getMember(i)) 7941 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 7942 } 7943 7944 void VPWidenCallRecipe::execute(VPTransformState &State) { 7945 State.ILV->widenCallInstruction(Ingredient, User, State); 7946 } 7947 7948 void VPWidenSelectRecipe::execute(VPTransformState &State) { 7949 State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State); 7950 } 7951 7952 void VPWidenRecipe::execute(VPTransformState &State) { 7953 State.ILV->widenInstruction(Ingredient, User, State); 7954 } 7955 7956 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7957 State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant, 7958 IsIndexLoopInvariant, State); 7959 } 7960 7961 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7962 assert(!State.Instance && "Int or FP induction being replicated."); 7963 State.ILV->widenIntOrFpInduction(IV, Trunc); 7964 } 7965 7966 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7967 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7968 } 7969 7970 void VPBlendRecipe::execute(VPTransformState &State) { 7971 
State.ILV->setDebugLocFromInst(State.Builder, Phi); 7972 // We know that all PHIs in non-header blocks are converted into 7973 // selects, so we don't have to worry about the insertion order and we 7974 // can just use the builder. 7975 // At this point we generate the predication tree. There may be 7976 // duplications since this is a simple recursive scan, but future 7977 // optimizations will clean it up. 7978 7979 unsigned NumIncoming = getNumIncomingValues(); 7980 7981 // Generate a sequence of selects of the form: 7982 // SELECT(Mask3, In3, 7983 // SELECT(Mask2, In2, 7984 // SELECT(Mask1, In1, 7985 // In0))) 7986 // Note that Mask0 is never used: lanes for which no path reaches this phi and 7987 // are essentially undef are taken from In0. 7988 InnerLoopVectorizer::VectorParts Entry(State.UF); 7989 for (unsigned In = 0; In < NumIncoming; ++In) { 7990 for (unsigned Part = 0; Part < State.UF; ++Part) { 7991 // We might have single edge PHIs (blocks) - use an identity 7992 // 'select' for the first PHI operand. 7993 Value *In0 = State.get(getIncomingValue(In), Part); 7994 if (In == 0) 7995 Entry[Part] = In0; // Initialize with the first incoming value. 7996 else { 7997 // Select between the current value and the previous incoming edge 7998 // based on the incoming mask. 7999 Value *Cond = State.get(getMask(In), Part); 8000 Entry[Part] = 8001 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 8002 } 8003 } 8004 } 8005 for (unsigned Part = 0; Part < State.UF; ++Part) 8006 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 8007 } 8008 8009 void VPInterleaveRecipe::execute(VPTransformState &State) { 8010 assert(!State.Instance && "Interleave group being replicated."); 8011 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); 8012 } 8013 8014 void VPReductionRecipe::execute(VPTransformState &State) { 8015 assert(!State.Instance && "Reduction being replicated."); 8016 for (unsigned Part = 0; Part < State.UF; ++Part) { 8017 unsigned Kind = RdxDesc->getRecurrenceKind(); 8018 Value *NewVecOp = State.get(VecOp, Part); 8019 Value *NewRed = 8020 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN); 8021 Value *PrevInChain = State.get(ChainOp, Part); 8022 Value *NextInChain; 8023 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8024 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8025 NextInChain = 8026 createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(), 8027 NewRed, PrevInChain); 8028 } else { 8029 NextInChain = State.Builder.CreateBinOp( 8030 (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain); 8031 } 8032 State.ValueMap.setVectorValue(I, Part, NextInChain); 8033 } 8034 } 8035 8036 void VPReplicateRecipe::execute(VPTransformState &State) { 8037 if (State.Instance) { // Generate a single instance. 8038 State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance, 8039 IsPredicated, State); 8040 // Insert scalar instance packing it into a vector. 8041 if (AlsoPack && State.VF.isVector()) { 8042 // If we're constructing lane 0, initialize to start from undef. 
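// Later lanes are then inserted into this vector one element at a time by packScalarIntoVectorValue below.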
8043 if (State.Instance->Lane == 0) { 8044 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 8045 Value *Undef = 8046 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); 8047 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); 8048 } 8049 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); 8050 } 8051 return; 8052 } 8053 8054 // Generate scalar instances for all VF lanes of all UF parts, unless the 8055 // instruction is uniform, in which case generate only the first lane for each 8056 // of the UF parts. 8057 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); 8058 for (unsigned Part = 0; Part < State.UF; ++Part) 8059 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 8060 State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane}, 8061 IsPredicated, State); 8062 } 8063 8064 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 8065 assert(State.Instance && "Branch on Mask works only on single instance."); 8066 8067 unsigned Part = State.Instance->Part; 8068 unsigned Lane = State.Instance->Lane; 8069 8070 Value *ConditionBit = nullptr; 8071 VPValue *BlockInMask = getMask(); 8072 if (BlockInMask) { 8073 ConditionBit = State.get(BlockInMask, Part); 8074 if (ConditionBit->getType()->isVectorTy()) 8075 ConditionBit = State.Builder.CreateExtractElement( 8076 ConditionBit, State.Builder.getInt32(Lane)); 8077 } else // Block in mask is all-one. 8078 ConditionBit = State.Builder.getTrue(); 8079 8080 // Replace the temporary unreachable terminator with a new conditional branch, 8081 // whose two destinations will be set later when they are created. 8082 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 8083 assert(isa<UnreachableInst>(CurrentTerminator) && 8084 "Expected to replace unreachable terminator with conditional branch."); 8085 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 8086 CondBr->setSuccessor(0, nullptr); 8087 ReplaceInstWithInst(CurrentTerminator, CondBr); 8088 } 8089 8090 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 8091 assert(State.Instance && "Predicated instruction PHI works per instance."); 8092 Instruction *ScalarPredInst = cast<Instruction>( 8093 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 8094 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 8095 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 8096 assert(PredicatingBB && "Predicated block has no single predecessor."); 8097 8098 // By current pack/unpack logic we need to generate only a single phi node: if 8099 // a vector value for the predicated instruction exists at this point it means 8100 // the instruction has vector users only, and a phi for the vector value is 8101 // needed. In this case the recipe of the predicated instruction is marked to 8102 // also do that packing, thereby "hoisting" the insert-element sequence. 8103 // Otherwise, a phi node for the scalar value is needed. 8104 unsigned Part = State.Instance->Part; 8105 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 8106 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 8107 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 8108 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 8109 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 8110 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 8111 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
  } else {
    Type *PredInstType = PredInst->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
  }
}

void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
  State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
                                        getMask());
}

// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
    LoopVectorizationLegality &LVL) {
  // 1) OptSize takes precedence over all other options, i.e. if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
  // LoopAccessInfo (due to code dependency and not being able to reliably get
  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
  // versioning when the vectorization is forced, unlike hasOptSize. So revert
  // back to the old way and vectorize with versioning when forced. See D81345.)
  if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                      PGSOQueryType::IRPass) &&
                          Hints.getForce() != LoopVectorizeHints::FK_Enabled))
    return CM_ScalarEpilogueNotAllowedOptSize;

  bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
                              !PreferPredicateOverEpilogue;

  // 2) Next, if disabling predication is requested on the command line, honour
  // this and request a scalar epilogue.
  if (PredicateOptDisabled)
    return CM_ScalarEpilogueAllowed;

  // 3) and 4) check whether enabling predication is requested on the command
  // line or with a loop hint, or whether the TTI hook indicates it is
  // profitable; if so, request predication.
  if (PreferPredicateOverEpilogue ||
      Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
      (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                        LVL.getLAI()) &&
       Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
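//
// Illustrative sketch (not part of the pass logic): the kind of loop nest this
// path is meant to handle is an explicitly annotated *outer* loop, for example
//
//   #pragma clang loop vectorize(enable)
//   for (int i = 0; i < N; ++i)     // outer loop handed to this function
//     for (int j = 0; j < M; ++j)   // inner loop kept inside the built VPlan
//       A[i][j] += B[i][j];
//
// assuming the VPlan-native path has been enabled (see EnableVPlanNativePath).
// Inner loops continue to take the regular path via processLoop() below.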
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);

  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF =
      LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM, BFI, PSI);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->empty()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
"enabled" 8255 : "?")) 8256 << " width=" << Hints.getWidth() 8257 << " unroll=" << Hints.getInterleave() << "\n"); 8258 8259 // Function containing loop 8260 Function *F = L->getHeader()->getParent(); 8261 8262 // Looking at the diagnostic output is the only way to determine if a loop 8263 // was vectorized (other than looking at the IR or machine code), so it 8264 // is important to generate an optimization remark for each loop. Most of 8265 // these messages are generated as OptimizationRemarkAnalysis. Remarks 8266 // generated as OptimizationRemark and OptimizationRemarkMissed are 8267 // less verbose reporting vectorized loops and unvectorized loops that may 8268 // benefit from vectorization, respectively. 8269 8270 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 8271 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 8272 return false; 8273 } 8274 8275 PredicatedScalarEvolution PSE(*SE, *L); 8276 8277 // Check if it is legal to vectorize the loop. 8278 LoopVectorizationRequirements Requirements(*ORE); 8279 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 8280 &Requirements, &Hints, DB, AC, BFI, PSI); 8281 if (!LVL.canVectorize(EnableVPlanNativePath)) { 8282 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 8283 Hints.emitRemarkWithHints(); 8284 return false; 8285 } 8286 8287 // Check the function attributes and profiles to find out if this function 8288 // should be optimized for size. 8289 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8290 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 8291 8292 // Entrance to the VPlan-native vectorization path. Outer loops are processed 8293 // here. They may require CFG and instruction level transformations before 8294 // even evaluating whether vectorization is profitable. Since we cannot modify 8295 // the incoming IR, we need to build VPlan upfront in the vectorization 8296 // pipeline. 8297 if (!L->empty()) 8298 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 8299 ORE, BFI, PSI, Hints); 8300 8301 assert(L->empty() && "Inner loop expected."); 8302 8303 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 8304 // count by optimizing for size, to minimize overheads. 8305 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 8306 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 8307 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 8308 << "This loop is worth vectorizing only if no scalar " 8309 << "iteration overheads are incurred."); 8310 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 8311 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 8312 else { 8313 LLVM_DEBUG(dbgs() << "\n"); 8314 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 8315 } 8316 } 8317 8318 // Check the function attributes to see if implicit floats are allowed. 8319 // FIXME: This check doesn't seem possibly correct -- what if the loop is 8320 // an integer loop and the vector instructions selected are purely integer 8321 // vector instructions? 8322 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 8323 reportVectorizationFailure( 8324 "Can't vectorize when the NoImplicitFloat attribute is used", 8325 "loop not vectorized due to NoImplicitFloat attribute", 8326 "NoImplicitFloat", ORE, L); 8327 Hints.emitRemarkWithHints(); 8328 return false; 8329 } 8330 8331 // Check if the target supports potentially unsafe FP vectorization. 
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor and interleave count.
  unsigned UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF =
      LVP.plan(ElementCount::getFixed(UserVF), UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being
    // explicitly requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM, BFI, PSI);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
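  // The returned LoopVectorizeResult records whether any IR was changed and
  // whether the CFG was changed; run() below uses these flags to decide which
  // analyses can be preserved.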
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
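
//===----------------------------------------------------------------------===//
// Usage sketch (illustrative only, not part of this file): with the new pass
// manager the vectorizer is scheduled like any other function pass, assuming
// the analysis managers have been set up and cross-registered through
// PassBuilder (declared in llvm/Passes/PassBuilder.h) and F is the function
// to vectorize:
//
//   llvm::PassBuilder PB;
//   llvm::LoopAnalysisManager LAM;
//   llvm::FunctionAnalysisManager FAM;
//   llvm::CGSCCAnalysisManager CGAM;
//   llvm::ModuleAnalysisManager MAM;
//   PB.registerModuleAnalyses(MAM);
//   PB.registerCGSCCAnalyses(CGAM);
//   PB.registerFunctionAnalyses(FAM);
//   PB.registerLoopAnalyses(LAM);
//   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
//
//   llvm::FunctionPassManager FPM;
//   FPM.addPass(llvm::LoopVectorizePass()); // default LoopVectorizeOptions
//   FPM.run(F, FAM);
//
// Equivalently, the pass runs as part of the default optimization pipelines
// built by PassBuilder at -O2 and above.
//===----------------------------------------------------------------------===//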