1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 10 // and generates target-independent LLVM-IR. 11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs 12 // of instructions in order to estimate the profitability of vectorization. 13 // 14 // The loop vectorizer combines consecutive loop iterations into a single 15 // 'wide' iteration. After this transformation the index is incremented 16 // by the SIMD vector width, and not by one. 17 // 18 // This pass has three parts: 19 // 1. The main loop pass that drives the different parts. 20 // 2. LoopVectorizationLegality - A unit that checks for the legality 21 // of the vectorization. 22 // 3. InnerLoopVectorizer - A unit that performs the actual 23 // widening of instructions. 24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability 25 // of vectorization. It decides on the optimal vector width, which 26 // can be one, if vectorization is not profitable. 27 // 28 // There is a development effort going on to migrate loop vectorizer to the 29 // VPlan infrastructure and to introduce outer loop vectorization support (see 30 // docs/Proposal/VectorizationPlan.rst and 31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this 32 // purpose, we temporarily introduced the VPlan-native vectorization path: an 33 // alternative vectorization path that is natively implemented on top of the 34 // VPlan infrastructure. See EnableVPlanNativePath for enabling. 
35 // 36 //===----------------------------------------------------------------------===// 37 // 38 // The reduction-variable vectorization is based on the paper: 39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 40 // 41 // Variable uniformity checks are inspired by: 42 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 43 // 44 // The interleaved access vectorization is based on the paper: 45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 46 // Data for SIMD 47 // 48 // Other ideas/concepts are from: 49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 50 // 51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 52 // Vectorizing Compilers. 53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SetVector.h" 73 #include "llvm/ADT/SmallPtrSet.h" 74 #include "llvm/ADT/SmallVector.h" 75 #include "llvm/ADT/Statistic.h" 76 #include "llvm/ADT/StringRef.h" 77 #include "llvm/ADT/Twine.h" 78 #include "llvm/ADT/iterator_range.h" 79 #include "llvm/Analysis/AssumptionCache.h" 80 #include "llvm/Analysis/BasicAliasAnalysis.h" 81 #include "llvm/Analysis/BlockFrequencyInfo.h" 82 #include "llvm/Analysis/CFG.h" 83 #include "llvm/Analysis/CodeMetrics.h" 84 #include "llvm/Analysis/DemandedBits.h" 85 #include "llvm/Analysis/GlobalsModRef.h" 86 #include "llvm/Analysis/LoopAccessAnalysis.h" 
87 #include "llvm/Analysis/LoopAnalysisManager.h" 88 #include "llvm/Analysis/LoopInfo.h" 89 #include "llvm/Analysis/LoopIterator.h" 90 #include "llvm/Analysis/MemorySSA.h" 91 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 92 #include "llvm/Analysis/ProfileSummaryInfo.h" 93 #include "llvm/Analysis/ScalarEvolution.h" 94 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 95 #include "llvm/Analysis/TargetLibraryInfo.h" 96 #include "llvm/Analysis/TargetTransformInfo.h" 97 #include "llvm/Analysis/VectorUtils.h" 98 #include "llvm/IR/Attributes.h" 99 #include "llvm/IR/BasicBlock.h" 100 #include "llvm/IR/CFG.h" 101 #include "llvm/IR/Constant.h" 102 #include "llvm/IR/Constants.h" 103 #include "llvm/IR/DataLayout.h" 104 #include "llvm/IR/DebugInfoMetadata.h" 105 #include "llvm/IR/DebugLoc.h" 106 #include "llvm/IR/DerivedTypes.h" 107 #include "llvm/IR/DiagnosticInfo.h" 108 #include "llvm/IR/Dominators.h" 109 #include "llvm/IR/Function.h" 110 #include "llvm/IR/IRBuilder.h" 111 #include "llvm/IR/InstrTypes.h" 112 #include "llvm/IR/Instruction.h" 113 #include "llvm/IR/Instructions.h" 114 #include "llvm/IR/IntrinsicInst.h" 115 #include "llvm/IR/Intrinsics.h" 116 #include "llvm/IR/LLVMContext.h" 117 #include "llvm/IR/Metadata.h" 118 #include "llvm/IR/Module.h" 119 #include "llvm/IR/Operator.h" 120 #include "llvm/IR/Type.h" 121 #include "llvm/IR/Use.h" 122 #include "llvm/IR/User.h" 123 #include "llvm/IR/Value.h" 124 #include "llvm/IR/ValueHandle.h" 125 #include "llvm/IR/Verifier.h" 126 #include "llvm/InitializePasses.h" 127 #include "llvm/Pass.h" 128 #include "llvm/Support/Casting.h" 129 #include "llvm/Support/CommandLine.h" 130 #include "llvm/Support/Compiler.h" 131 #include "llvm/Support/Debug.h" 132 #include "llvm/Support/ErrorHandling.h" 133 #include "llvm/Support/MathExtras.h" 134 #include "llvm/Support/raw_ostream.h" 135 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 136 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 137 #include 
"llvm/Transforms/Utils/LoopSimplify.h" 138 #include "llvm/Transforms/Utils/LoopUtils.h" 139 #include "llvm/Transforms/Utils/LoopVersioning.h" 140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 141 #include "llvm/Transforms/Utils/SizeOpts.h" 142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 143 #include <algorithm> 144 #include <cassert> 145 #include <cstdint> 146 #include <cstdlib> 147 #include <functional> 148 #include <iterator> 149 #include <limits> 150 #include <memory> 151 #include <string> 152 #include <tuple> 153 #include <utility> 154 155 using namespace llvm; 156 157 #define LV_NAME "loop-vectorize" 158 #define DEBUG_TYPE LV_NAME 159 160 /// @{ 161 /// Metadata attribute names 162 static const char *const LLVMLoopVectorizeFollowupAll = 163 "llvm.loop.vectorize.followup_all"; 164 static const char *const LLVMLoopVectorizeFollowupVectorized = 165 "llvm.loop.vectorize.followup_vectorized"; 166 static const char *const LLVMLoopVectorizeFollowupEpilogue = 167 "llvm.loop.vectorize.followup_epilogue"; 168 /// @} 169 170 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 172 173 /// Loops with a known constant trip count below this number are vectorized only 174 /// if no scalar iteration overheads are incurred. 175 static cl::opt<unsigned> TinyTripCountVectorThreshold( 176 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 177 cl::desc("Loops with a constant trip count that is smaller than this " 178 "value are vectorized only if no scalar iteration overheads " 179 "are incurred.")); 180 181 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 182 // that predication is preferred, and this lists all options. I.e., the 183 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 184 // and predicate the instructions accordingly. 
// If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

/// When set, consider vectorization factors driven by the smallest type in the
/// loop so register bandwidth is maximized.
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with a estimated constant trip count "
             "below this number"));

// The Force* options below exist for testing: they override the values the
// cost model would otherwise query from TargetTransformInfo.
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the targets preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

// Non-static: also referenced from other passes to detect whether the
// VPlan-native path is active.
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
333 static Type *getMemInstValueType(Value *I) { 334 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 335 "Expected Load or Store instruction"); 336 if (auto *LI = dyn_cast<LoadInst>(I)) 337 return LI->getType(); 338 return cast<StoreInst>(I)->getValueOperand()->getType(); 339 } 340 341 /// A helper function that returns true if the given type is irregular. The 342 /// type is irregular if its allocated size doesn't equal the store size of an 343 /// element of the corresponding vector type at the given vectorization factor. 344 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) { 345 assert(!VF.Scalable && "scalable vectors not yet supported."); 346 // Determine if an array of VF elements of type Ty is "bitcast compatible" 347 // with a <VF x Ty> vector. 348 if (VF.isVector()) { 349 auto *VectorTy = VectorType::get(Ty, VF); 350 return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy); 351 } 352 353 // If the vectorization factor is one, we just check if an array of type Ty 354 // requires padding between elements. 355 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 356 } 357 358 /// A helper function that returns the reciprocal of the block probability of 359 /// predicated blocks. If we return X, we are assuming the predicated block 360 /// will execute once for every X iterations of the loop header. 361 /// 362 /// TODO: We should use actual block probability here, if available. Currently, 363 /// we always assume predicated blocks have a 50% chance of executing. 364 static unsigned getReciprocalPredBlockProb() { return 2; } 365 366 /// A helper function that adds a 'fast' flag to floating-point operations. 
static Value *addFastMathFlag(Value *V) {
  // Only instructions that are FPMathOperators can carry fast-math flags;
  // everything else is returned unchanged.
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

/// Overload that applies the caller-provided fast-math flags \p FMF instead of
/// the full 'fast' preset. \p V is returned unchanged if it is not a
/// floating-point operation.
static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known (a non-zero result means "known").
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data;
  // gated on the block-frequency/PGO heuristics option.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars.
This class also implements the following features: 415 /// * It inserts an epilogue loop for handling loops that don't have iteration 416 /// counts that are known to be a multiple of the vectorization factor. 417 /// * It handles the code generation for reduction variables. 418 /// * Scalarization (implementation using scalars) of un-vectorizable 419 /// instructions. 420 /// InnerLoopVectorizer does not perform any vectorization-legality 421 /// checks, and relies on the caller to check for the different legality 422 /// aspects. The InnerLoopVectorizer relies on the 423 /// LoopVectorizationLegality class to provide information about the induction 424 /// and reduction variables that were found to a given vectorization factor. 425 class InnerLoopVectorizer { 426 public: 427 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 428 LoopInfo *LI, DominatorTree *DT, 429 const TargetLibraryInfo *TLI, 430 const TargetTransformInfo *TTI, AssumptionCache *AC, 431 OptimizationRemarkEmitter *ORE, ElementCount VecWidth, 432 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 433 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 434 ProfileSummaryInfo *PSI) 435 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 436 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), 437 Builder(PSE.getSE()->getContext()), 438 VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM), 439 BFI(BFI), PSI(PSI) { 440 // Query this against the original loop and save it here because the profile 441 // of the original loop header may change as the transformation happens. 442 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( 443 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); 444 } 445 446 virtual ~InnerLoopVectorizer() = default; 447 448 /// Create a new empty loop that will contain vectorized instructions later 449 /// on, while the old loop will be used as the scalar remainder. 
Control flow 450 /// is generated around the vectorized (and scalar epilogue) loops consisting 451 /// of various checks and bypasses. Return the pre-header block of the new 452 /// loop. 453 BasicBlock *createVectorizedLoopSkeleton(); 454 455 /// Widen a single instruction within the innermost loop. 456 void widenInstruction(Instruction &I, VPUser &Operands, 457 VPTransformState &State); 458 459 /// Widen a single call instruction within the innermost loop. 460 void widenCallInstruction(CallInst &I, VPUser &ArgOperands, 461 VPTransformState &State); 462 463 /// Widen a single select instruction within the innermost loop. 464 void widenSelectInstruction(SelectInst &I, VPUser &Operands, 465 bool InvariantCond, VPTransformState &State); 466 467 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 468 void fixVectorizedLoop(); 469 470 // Return true if any runtime check is added. 471 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 472 473 /// A type for vectorized values in the new loop. Each value from the 474 /// original loop, when vectorized, is represented by UF vector values in the 475 /// new unrolled loop, where UF is the unroll factor. 476 using VectorParts = SmallVector<Value *, 2>; 477 478 /// Vectorize a single GetElementPtrInst based on information gathered and 479 /// decisions taken during planning. 480 void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF, 481 ElementCount VF, bool IsPtrLoopInvariant, 482 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); 483 484 /// Vectorize a single PHINode in a block. This method handles the induction 485 /// variable canonicalization. It supports both VF = 1 for unrolled loops and 486 /// arbitrary length vectors. 487 void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF); 488 489 /// A helper function to scalarize a single Instruction in the innermost loop. 
490 /// Generates a sequence of scalar instances for each lane between \p MinLane 491 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 492 /// inclusive. Uses the VPValue operands from \p Operands instead of \p 493 /// Instr's operands. 494 void scalarizeInstruction(Instruction *Instr, VPUser &Operands, 495 const VPIteration &Instance, bool IfPredicateInstr, 496 VPTransformState &State); 497 498 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 499 /// is provided, the integer induction variable will first be truncated to 500 /// the corresponding type. 501 void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); 502 503 /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a 504 /// vector or scalar value on-demand if one is not yet available. When 505 /// vectorizing a loop, we visit the definition of an instruction before its 506 /// uses. When visiting the definition, we either vectorize or scalarize the 507 /// instruction, creating an entry for it in the corresponding map. (In some 508 /// cases, such as induction variables, we will create both vector and scalar 509 /// entries.) Then, as we encounter uses of the definition, we derive values 510 /// for each scalar or vector use unless such a value is already available. 511 /// For example, if we scalarize a definition and one of its uses is vector, 512 /// we build the required vector on-demand with an insertelement sequence 513 /// when visiting the use. Otherwise, if the use is scalar, we can use the 514 /// existing scalar definition. 515 /// 516 /// Return a value in the new loop corresponding to \p V from the original 517 /// loop at unroll index \p Part. If the value has already been vectorized, 518 /// the corresponding vector entry in VectorLoopValueMap is returned. 
If, 519 /// however, the value has a scalar entry in VectorLoopValueMap, we construct 520 /// a new vector value on-demand by inserting the scalar values into a vector 521 /// with an insertelement sequence. If the value has been neither vectorized 522 /// nor scalarized, it must be loop invariant, so we simply broadcast the 523 /// value into a vector. 524 Value *getOrCreateVectorValue(Value *V, unsigned Part); 525 526 /// Return a value in the new loop corresponding to \p V from the original 527 /// loop at unroll and vector indices \p Instance. If the value has been 528 /// vectorized but not scalarized, the necessary extractelement instruction 529 /// will be generated. 530 Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance); 531 532 /// Construct the vector value of a scalarized value \p V one lane at a time. 533 void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); 534 535 /// Try to vectorize interleaved access group \p Group with the base address 536 /// given in \p Addr, optionally masking the vector operations if \p 537 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 538 /// values in the vectorized loop. 539 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 540 VPTransformState &State, VPValue *Addr, 541 VPValue *BlockInMask = nullptr); 542 543 /// Vectorize Load and Store instructions with the base address given in \p 544 /// Addr, optionally masking the vector operations if \p BlockInMask is 545 /// non-null. Use \p State to translate given VPValues to IR values in the 546 /// vectorized loop. 547 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, 548 VPValue *Addr, VPValue *StoredValue, 549 VPValue *BlockInMask); 550 551 /// Set the debug location in the builder using the debug location in 552 /// the instruction. 
553 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); 554 555 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 556 void fixNonInductionPHIs(void); 557 558 protected: 559 friend class LoopVectorizationPlanner; 560 561 /// A small list of PHINodes. 562 using PhiVector = SmallVector<PHINode *, 4>; 563 564 /// A type for scalarized values in the new loop. Each value from the 565 /// original loop, when scalarized, is represented by UF x VF scalar values 566 /// in the new unrolled loop, where UF is the unroll factor and VF is the 567 /// vectorization factor. 568 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 569 570 /// Set up the values of the IVs correctly when exiting the vector loop. 571 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 572 Value *CountRoundDown, Value *EndValue, 573 BasicBlock *MiddleBlock); 574 575 /// Create a new induction variable inside L. 576 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 577 Value *Step, Instruction *DL); 578 579 /// Handle all cross-iteration phis in the header. 580 void fixCrossIterationPHIs(); 581 582 /// Fix a first-order recurrence. This is the second phase of vectorizing 583 /// this phi node. 584 void fixFirstOrderRecurrence(PHINode *Phi); 585 586 /// Fix a reduction cross-iteration phi. This is the second phase of 587 /// vectorizing this phi node. 588 void fixReduction(PHINode *Phi); 589 590 /// Clear NSW/NUW flags from reduction instructions if necessary. 591 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); 592 593 /// The Loop exit block may have single value PHI nodes with some 594 /// incoming value. While vectorizing we only handled real values 595 /// that were defined inside the loop and we should have one value for 596 /// each predecessor of its parent basic block. See PR14725. 
597 void fixLCSSAPHIs(); 598 599 /// Iteratively sink the scalarized operands of a predicated instruction into 600 /// the block that was created for it. 601 void sinkScalarOperands(Instruction *PredInst); 602 603 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 604 /// represented as. 605 void truncateToMinimalBitwidths(); 606 607 /// Create a broadcast instruction. This method generates a broadcast 608 /// instruction (shuffle) for loop invariant values and for the induction 609 /// value. If this is the induction variable then we extend it to N, N+1, ... 610 /// this is needed because each iteration in the loop corresponds to a SIMD 611 /// element. 612 virtual Value *getBroadcastInstrs(Value *V); 613 614 /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) 615 /// to each vector element of Val. The sequence starts at StartIndex. 616 /// \p Opcode is relevant for FP induction variable. 617 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, 618 Instruction::BinaryOps Opcode = 619 Instruction::BinaryOpsEnd); 620 621 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 622 /// variable on which to base the steps, \p Step is the size of the step, and 623 /// \p EntryVal is the value from the original loop that maps to the steps. 624 /// Note that \p EntryVal doesn't have to be an induction variable - it 625 /// can also be a truncate instruction. 626 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 627 const InductionDescriptor &ID); 628 629 /// Create a vector induction phi node based on an existing scalar one. \p 630 /// EntryVal is the value from the original loop that maps to the vector phi 631 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a 632 /// truncate instruction, instead of widening the original IV, we widen a 633 /// version of the IV truncated to \p EntryVal's type. 
634 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, 635 Value *Step, Instruction *EntryVal); 636 637 /// Returns true if an instruction \p I should be scalarized instead of 638 /// vectorized for the chosen vectorization factor. 639 bool shouldScalarizeInstruction(Instruction *I) const; 640 641 /// Returns true if we should generate a scalar version of \p IV. 642 bool needsScalarInduction(Instruction *IV) const; 643 644 /// If there is a cast involved in the induction variable \p ID, which should 645 /// be ignored in the vectorized loop body, this function records the 646 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the 647 /// cast. We had already proved that the casted Phi is equal to the uncasted 648 /// Phi in the vectorized loop (under a runtime guard), and therefore 649 /// there is no need to vectorize the cast - the same value can be used in the 650 /// vector loop for both the Phi and the cast. 651 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, 652 /// Otherwise, \p VectorLoopValue is a widened/vectorized value. 653 /// 654 /// \p EntryVal is the value from the original loop that maps to the vector 655 /// phi node and is used to distinguish what is the IV currently being 656 /// processed - original one (if \p EntryVal is a phi corresponding to the 657 /// original IV) or the "newly-created" one based on the proof mentioned above 658 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the 659 /// latter case \p EntryVal is a TruncInst and we must not record anything for 660 /// that IV, but it's error-prone to expect callers of this routine to care 661 /// about that, hence this explicit parameter. 
662 void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID, 663 const Instruction *EntryVal, 664 Value *VectorLoopValue, 665 unsigned Part, 666 unsigned Lane = UINT_MAX); 667 668 /// Generate a shuffle sequence that will reverse the vector Vec. 669 virtual Value *reverseVector(Value *Vec); 670 671 /// Returns (and creates if needed) the original loop trip count. 672 Value *getOrCreateTripCount(Loop *NewLoop); 673 674 /// Returns (and creates if needed) the trip count of the widened loop. 675 Value *getOrCreateVectorTripCount(Loop *NewLoop); 676 677 /// Returns a bitcasted value to the requested vector type. 678 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 679 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 680 const DataLayout &DL); 681 682 /// Emit a bypass check to see if the vector trip count is zero, including if 683 /// it overflows. 684 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); 685 686 /// Emit a bypass check to see if all of the SCEV assumptions we've 687 /// had to make are correct. 688 void emitSCEVChecks(Loop *L, BasicBlock *Bypass); 689 690 /// Emit bypass checks to check any memory assumptions we may have made. 691 void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); 692 693 /// Compute the transformed value of Index at offset StartValue using step 694 /// StepValue. 695 /// For integer induction, returns StartValue + Index * StepValue. 696 /// For pointer induction, returns StartValue[Index * StepValue]. 697 /// FIXME: The newly created binary instructions should contain nsw/nuw 698 /// flags, which can be found from the original scalar operations. 699 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, 700 const DataLayout &DL, 701 const InductionDescriptor &ID) const; 702 703 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 704 /// vector loop preheader, middle block and scalar preheader. 
Also 705 /// allocate a loop object for the new vector loop and return it. 706 Loop *createVectorLoopSkeleton(StringRef Prefix); 707 708 /// Create new phi nodes for the induction variables to resume iteration count 709 /// in the scalar epilogue, from where the vectorized loop left off (given by 710 /// \p VectorTripCount). 711 void createInductionResumeValues(Loop *L, Value *VectorTripCount); 712 713 /// Complete the loop skeleton by adding debug MDs, creating appropriate 714 /// conditional branches in the middle block, preparing the builder and 715 /// running the verifier. Take in the vector loop \p L as argument, and return 716 /// the preheader of the completed vector loop. 717 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); 718 719 /// Add additional metadata to \p To that was not present on \p Orig. 720 /// 721 /// Currently this is used to add the noalias annotations based on the 722 /// inserted memchecks. Use this for instructions that are *cloned* into the 723 /// vector loop. 724 void addNewMetadata(Instruction *To, const Instruction *Orig); 725 726 /// Add metadata from one instruction to another. 727 /// 728 /// This includes both the original MDs from \p From and additional ones (\see 729 /// addNewMetadata). Use this for *newly created* instructions in the vector 730 /// loop. 731 void addMetadata(Instruction *To, Instruction *From); 732 733 /// Similar to the previous function but it adds the metadata to a 734 /// vector of instructions. 735 void addMetadata(ArrayRef<Value *> To, Instruction *From); 736 737 /// The original loop. 738 Loop *OrigLoop; 739 740 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 741 /// dynamic knowledge to simplify SCEV expressions and converts them to a 742 /// more usable form. 743 PredicatedScalarEvolution &PSE; 744 745 /// Loop Info. 746 LoopInfo *LI; 747 748 /// Dominator Tree. 749 DominatorTree *DT; 750 751 /// Alias Analysis. 
752 AAResults *AA; 753 754 /// Target Library Info. 755 const TargetLibraryInfo *TLI; 756 757 /// Target Transform Info. 758 const TargetTransformInfo *TTI; 759 760 /// Assumption Cache. 761 AssumptionCache *AC; 762 763 /// Interface to emit optimization remarks. 764 OptimizationRemarkEmitter *ORE; 765 766 /// LoopVersioning. It's only set up (non-null) if memchecks were 767 /// used. 768 /// 769 /// This is currently only used to add no-alias metadata based on the 770 /// memchecks. The actually versioning is performed manually. 771 std::unique_ptr<LoopVersioning> LVer; 772 773 /// The vectorization SIMD factor to use. Each vector will have this many 774 /// vector elements. 775 ElementCount VF; 776 777 /// The vectorization unroll factor to use. Each scalar is vectorized to this 778 /// many different vector instructions. 779 unsigned UF; 780 781 /// The builder that we use 782 IRBuilder<> Builder; 783 784 // --- Vectorization state --- 785 786 /// The vector-loop preheader. 787 BasicBlock *LoopVectorPreHeader; 788 789 /// The scalar-loop preheader. 790 BasicBlock *LoopScalarPreHeader; 791 792 /// Middle Block between the vector and the scalar. 793 BasicBlock *LoopMiddleBlock; 794 795 /// The ExitBlock of the scalar loop. 796 BasicBlock *LoopExitBlock; 797 798 /// The vector loop body. 799 BasicBlock *LoopVectorBody; 800 801 /// The scalar loop body. 802 BasicBlock *LoopScalarBody; 803 804 /// A list of all bypass blocks. The first block is the entry of the loop. 805 SmallVector<BasicBlock *, 4> LoopBypassBlocks; 806 807 /// The new Induction variable which was added to the new block. 808 PHINode *Induction = nullptr; 809 810 /// The induction variable of the old basic block. 811 PHINode *OldInduction = nullptr; 812 813 /// Maps values from the original loop to their corresponding values in the 814 /// vectorized loop. 
A key value can map to either vector values, scalar 815 /// values or both kinds of values, depending on whether the key was 816 /// vectorized and scalarized. 817 VectorizerValueMap VectorLoopValueMap; 818 819 /// Store instructions that were predicated. 820 SmallVector<Instruction *, 4> PredicatedInstructions; 821 822 /// Trip count of the original loop. 823 Value *TripCount = nullptr; 824 825 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 826 Value *VectorTripCount = nullptr; 827 828 /// The legality analysis. 829 LoopVectorizationLegality *Legal; 830 831 /// The profitablity analysis. 832 LoopVectorizationCostModel *Cost; 833 834 // Record whether runtime checks are added. 835 bool AddedSafetyChecks = false; 836 837 // Holds the end values for each induction variable. We save the end values 838 // so we can later fix-up the external users of the induction variables. 839 DenseMap<PHINode *, Value *> IVEndValues; 840 841 // Vector of original scalar PHIs whose corresponding widened PHIs need to be 842 // fixed up at the end of vector code generation. 843 SmallVector<PHINode *, 8> OrigPHIsToFix; 844 845 /// BFI and PSI are used to check for profile guided size optimizations. 846 BlockFrequencyInfo *BFI; 847 ProfileSummaryInfo *PSI; 848 849 // Whether this loop should be optimized for size based on profile guided size 850 // optimizatios. 
  bool OptForSizeBasedOnProfile;
};

/// A refinement of InnerLoopVectorizer that only interleaves (unrolls) the
/// loop: it forwards to the base class with a fixed vectorization factor of 1
/// and the requested \p UnrollFactor, so no vector instructions are created
/// by the widening hooks below.
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  // Scalar (VF = 1) overrides of the vector-emitting hooks.
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
880 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 881 if (!I) 882 return I; 883 884 DebugLoc Empty; 885 if (I->getDebugLoc() != Empty) 886 return I; 887 888 for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { 889 if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) 890 if (OpInst->getDebugLoc() != Empty) 891 return OpInst; 892 } 893 894 return I; 895 } 896 897 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { 898 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { 899 const DILocation *DIL = Inst->getDebugLoc(); 900 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && 901 !isa<DbgInfoIntrinsic>(Inst)) { 902 assert(!VF.Scalable && "scalable vectors not yet supported."); 903 auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF.Min); 904 if (NewDIL) 905 B.SetCurrentDebugLocation(NewDIL.getValue()); 906 else 907 LLVM_DEBUG(dbgs() 908 << "Failed to create new discriminator: " 909 << DIL->getFilename() << " Line: " << DIL->getLine()); 910 } 911 else 912 B.SetCurrentDebugLocation(DIL); 913 } else 914 B.SetCurrentDebugLocation(DebugLoc()); 915 } 916 917 /// Write a record \p DebugMsg about vectorization failure to the debug 918 /// output stream. If \p I is passed, it is an instruction that prevents 919 /// vectorization. 920 #ifndef NDEBUG 921 static void debugVectorizationFailure(const StringRef DebugMsg, 922 Instruction *I) { 923 dbgs() << "LV: Not vectorizing: " << DebugMsg; 924 if (I != nullptr) 925 dbgs() << " " << *I; 926 else 927 dbgs() << '.'; 928 dbgs() << '\n'; 929 } 930 #endif 931 932 /// Create an analysis remark that explains why vectorization failed 933 /// 934 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 935 /// RemarkName is the identifier for the remark. If \p I is passed it is an 936 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 937 /// the location of the remark. 
\return the remark object that can be 938 /// streamed to. 939 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 940 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 941 Value *CodeRegion = TheLoop->getHeader(); 942 DebugLoc DL = TheLoop->getStartLoc(); 943 944 if (I) { 945 CodeRegion = I->getParent(); 946 // If there is no debug location attached to the instruction, revert back to 947 // using the loop's. 948 if (I->getDebugLoc()) 949 DL = I->getDebugLoc(); 950 } 951 952 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); 953 R << "loop not vectorized: "; 954 return R; 955 } 956 957 namespace llvm { 958 959 void reportVectorizationFailure(const StringRef DebugMsg, 960 const StringRef OREMsg, const StringRef ORETag, 961 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { 962 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); 963 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 964 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), 965 ORETag, TheLoop, I) << OREMsg); 966 } 967 968 } // end namespace llvm 969 970 #ifndef NDEBUG 971 /// \return string containing a file name and a line # for the given loop. 972 static std::string getDebugLocString(const Loop *L) { 973 std::string Result; 974 if (L) { 975 raw_string_ostream OS(Result); 976 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 977 LoopDbgLoc.print(OS); 978 else 979 // Just print the module name. 980 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 981 OS.flush(); 982 } 983 return Result; 984 } 985 #endif 986 987 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 988 const Instruction *Orig) { 989 // If the loop was versioned with memchecks, add the corresponding no-alias 990 // metadata. 
991 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 992 LVer->annotateInstWithNoAlias(To, Orig); 993 } 994 995 void InnerLoopVectorizer::addMetadata(Instruction *To, 996 Instruction *From) { 997 propagateMetadata(To, From); 998 addNewMetadata(To, From); 999 } 1000 1001 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1002 Instruction *From) { 1003 for (Value *V : To) { 1004 if (Instruction *I = dyn_cast<Instruction>(V)) 1005 addMetadata(I, From); 1006 } 1007 } 1008 1009 namespace llvm { 1010 1011 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1012 // lowered. 1013 enum ScalarEpilogueLowering { 1014 1015 // The default: allowing scalar epilogues. 1016 CM_ScalarEpilogueAllowed, 1017 1018 // Vectorization with OptForSize: don't allow epilogues. 1019 CM_ScalarEpilogueNotAllowedOptSize, 1020 1021 // A special case of vectorisation with OptForSize: loops with a very small 1022 // trip count are considered for vectorization under OptForSize, thereby 1023 // making sure the cost of their loop body is dominant, free of runtime 1024 // guards and scalar iteration overheads. 1025 CM_ScalarEpilogueNotAllowedLowTripLoop, 1026 1027 // Loop hint predicate indicating an epilogue is undesired. 1028 CM_ScalarEpilogueNotNeededUsePredicate 1029 }; 1030 1031 /// LoopVectorizationCostModel - estimates the expected speedups due to 1032 /// vectorization. 1033 /// In many cases vectorization is not profitable. This can happen because of 1034 /// a number of reasons. In this class we mainly attempt to predict the 1035 /// expected speedup/slowdowns due to the supported instruction set. We use the 1036 /// TargetTransformInfo to query the different backends for the cost of 1037 /// different operations. 
class LoopVectorizationCostModel {
public:
  /// Construct the cost model. All analyses are owned by the caller and must
  /// outlive this object; nothing is computed here - queries are answered
  /// lazily by the collect*/compute* members below.
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
1117 /// The vector equivalents of these instructions should be truncated to this 1118 /// type. 1119 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1120 return MinBWs; 1121 } 1122 1123 /// \returns True if it is more profitable to scalarize instruction \p I for 1124 /// vectorization factor \p VF. 1125 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1126 assert(VF.isVector() && 1127 "Profitable to scalarize relevant only for VF > 1."); 1128 1129 // Cost model is not run in the VPlan-native path - return conservative 1130 // result until this changes. 1131 if (EnableVPlanNativePath) 1132 return false; 1133 1134 auto Scalars = InstsToScalarize.find(VF); 1135 assert(Scalars != InstsToScalarize.end() && 1136 "VF not yet analyzed for scalarization profitability"); 1137 return Scalars->second.find(I) != Scalars->second.end(); 1138 } 1139 1140 /// Returns true if \p I is known to be uniform after vectorization. 1141 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1142 if (VF.isScalar()) 1143 return true; 1144 1145 // Cost model is not run in the VPlan-native path - return conservative 1146 // result until this changes. 1147 if (EnableVPlanNativePath) 1148 return false; 1149 1150 auto UniformsPerVF = Uniforms.find(VF); 1151 assert(UniformsPerVF != Uniforms.end() && 1152 "VF not yet analyzed for uniformity"); 1153 return UniformsPerVF->second.count(I); 1154 } 1155 1156 /// Returns true if \p I is known to be scalar after vectorization. 1157 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1158 if (VF.isScalar()) 1159 return true; 1160 1161 // Cost model is not run in the VPlan-native path - return conservative 1162 // result until this changes. 
1163 if (EnableVPlanNativePath) 1164 return false; 1165 1166 auto ScalarsPerVF = Scalars.find(VF); 1167 assert(ScalarsPerVF != Scalars.end() && 1168 "Scalar values are not calculated for VF"); 1169 return ScalarsPerVF->second.count(I); 1170 } 1171 1172 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1173 /// for vectorization factor \p VF. 1174 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1175 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1176 !isProfitableToScalarize(I, VF) && 1177 !isScalarAfterVectorization(I, VF); 1178 } 1179 1180 /// Decision that was taken during cost calculation for memory instruction. 1181 enum InstWidening { 1182 CM_Unknown, 1183 CM_Widen, // For consecutive accesses with stride +1. 1184 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1185 CM_Interleave, 1186 CM_GatherScatter, 1187 CM_Scalarize 1188 }; 1189 1190 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1191 /// instruction \p I and vector width \p VF. 1192 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1193 unsigned Cost) { 1194 assert(VF.isVector() && "Expected VF >=2"); 1195 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1196 } 1197 1198 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1199 /// interleaving group \p Grp and vector width \p VF. 1200 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1201 ElementCount VF, InstWidening W, unsigned Cost) { 1202 assert(VF.isVector() && "Expected VF >=2"); 1203 /// Broadcast this decicion to all instructions inside the group. 1204 /// But the cost will be assigned to one instruction only. 
1205 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1206 if (auto *I = Grp->getMember(i)) { 1207 if (Grp->getInsertPos() == I) 1208 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1209 else 1210 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1211 } 1212 } 1213 } 1214 1215 /// Return the cost model decision for the given instruction \p I and vector 1216 /// width \p VF. Return CM_Unknown if this instruction did not pass 1217 /// through the cost modeling. 1218 InstWidening getWideningDecision(Instruction *I, ElementCount VF) { 1219 assert(!VF.Scalable && "scalable vectors not yet supported."); 1220 assert(VF.isVector() && "Expected VF >=2"); 1221 1222 // Cost model is not run in the VPlan-native path - return conservative 1223 // result until this changes. 1224 if (EnableVPlanNativePath) 1225 return CM_GatherScatter; 1226 1227 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1228 auto Itr = WideningDecisions.find(InstOnVF); 1229 if (Itr == WideningDecisions.end()) 1230 return CM_Unknown; 1231 return Itr->second.first; 1232 } 1233 1234 /// Return the vectorization cost for the given instruction \p I and vector 1235 /// width \p VF. 1236 unsigned getWideningCost(Instruction *I, ElementCount VF) { 1237 assert(VF.isVector() && "Expected VF >=2"); 1238 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1239 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1240 "The cost is not calculated"); 1241 return WideningDecisions[InstOnVF].second; 1242 } 1243 1244 /// Return True if instruction \p I is an optimizable truncate whose operand 1245 /// is an induction variable. Such a truncate will be removed by adding a new 1246 /// induction variable with the destination type. 1247 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1248 // If the instruction is not a truncate, return false. 
1249 auto *Trunc = dyn_cast<TruncInst>(I); 1250 if (!Trunc) 1251 return false; 1252 1253 // Get the source and destination types of the truncate. 1254 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1255 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1256 1257 // If the truncate is free for the given types, return false. Replacing a 1258 // free truncate with an induction variable would add an induction variable 1259 // update instruction to each iteration of the loop. We exclude from this 1260 // check the primary induction variable since it will need an update 1261 // instruction regardless. 1262 Value *Op = Trunc->getOperand(0); 1263 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1264 return false; 1265 1266 // If the truncated value is not an induction variable, return false. 1267 return Legal->isInductionPhi(Op); 1268 } 1269 1270 /// Collects the instructions to scalarize for each predicated instruction in 1271 /// the loop. 1272 void collectInstsToScalarize(ElementCount VF); 1273 1274 /// Collect Uniform and Scalar values for the given \p VF. 1275 /// The sets depend on CM decision for Load/Store instructions 1276 /// that may be vectorized as interleave, gather-scatter or scalarized. 1277 void collectUniformsAndScalars(ElementCount VF) { 1278 // Do the analysis once. 1279 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1280 return; 1281 setCostBasedWideningDecision(VF); 1282 collectLoopUniforms(VF); 1283 collectLoopScalars(VF); 1284 } 1285 1286 /// Returns true if the target machine supports masked store operation 1287 /// for the given \p DataType and kind of access to \p Ptr. 
1288 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) { 1289 return Legal->isConsecutivePtr(Ptr) && 1290 TTI.isLegalMaskedStore(DataType, Alignment); 1291 } 1292 1293 /// Returns true if the target machine supports masked load operation 1294 /// for the given \p DataType and kind of access to \p Ptr. 1295 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) { 1296 return Legal->isConsecutivePtr(Ptr) && 1297 TTI.isLegalMaskedLoad(DataType, Alignment); 1298 } 1299 1300 /// Returns true if the target machine supports masked scatter operation 1301 /// for the given \p DataType. 1302 bool isLegalMaskedScatter(Type *DataType, Align Alignment) { 1303 return TTI.isLegalMaskedScatter(DataType, Alignment); 1304 } 1305 1306 /// Returns true if the target machine supports masked gather operation 1307 /// for the given \p DataType. 1308 bool isLegalMaskedGather(Type *DataType, Align Alignment) { 1309 return TTI.isLegalMaskedGather(DataType, Alignment); 1310 } 1311 1312 /// Returns true if the target machine can represent \p V as a masked gather 1313 /// or scatter operation. 1314 bool isLegalGatherOrScatter(Value *V) { 1315 bool LI = isa<LoadInst>(V); 1316 bool SI = isa<StoreInst>(V); 1317 if (!LI && !SI) 1318 return false; 1319 auto *Ty = getMemInstValueType(V); 1320 Align Align = getLoadStoreAlignment(V); 1321 return (LI && isLegalMaskedGather(Ty, Align)) || 1322 (SI && isLegalMaskedScatter(Ty, Align)); 1323 } 1324 1325 /// Returns true if \p I is an instruction that will be scalarized with 1326 /// predication. Such instructions include conditional stores and 1327 /// instructions that may divide by zero. 1328 /// If a non-zero VF has been calculated, we check if I will be scalarized 1329 /// predication for that VF. 
1330 bool isScalarWithPredication(Instruction *I, 1331 ElementCount VF = ElementCount::getFixed(1)); 1332 1333 // Returns true if \p I is an instruction that will be predicated either 1334 // through scalar predication or masked load/store or masked gather/scatter. 1335 // Superset of instructions that return true for isScalarWithPredication. 1336 bool isPredicatedInst(Instruction *I) { 1337 if (!blockNeedsPredication(I->getParent())) 1338 return false; 1339 // Loads and stores that need some form of masked operation are predicated 1340 // instructions. 1341 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1342 return Legal->isMaskRequired(I); 1343 return isScalarWithPredication(I); 1344 } 1345 1346 /// Returns true if \p I is a memory instruction with consecutive memory 1347 /// access that can be widened. 1348 bool 1349 memoryInstructionCanBeWidened(Instruction *I, 1350 ElementCount VF = ElementCount::getFixed(1)); 1351 1352 /// Returns true if \p I is a memory instruction in an interleaved-group 1353 /// of memory accesses that can be vectorized with wide vector loads/stores 1354 /// and shuffles. 1355 bool 1356 interleavedAccessCanBeWidened(Instruction *I, 1357 ElementCount VF = ElementCount::getFixed(1)); 1358 1359 /// Check if \p Instr belongs to any interleaved access group. 1360 bool isAccessInterleaved(Instruction *Instr) { 1361 return InterleaveInfo.isInterleaved(Instr); 1362 } 1363 1364 /// Get the interleaved access group that \p Instr belongs to. 1365 const InterleaveGroup<Instruction> * 1366 getInterleavedAccessGroup(Instruction *Instr) { 1367 return InterleaveInfo.getInterleaveGroup(Instr); 1368 } 1369 1370 /// Returns true if an interleaved group requires a scalar iteration 1371 /// to handle accesses with gaps, and there is nothing preventing us from 1372 /// creating a scalar epilogue. 
1373 bool requiresScalarEpilogue() const { 1374 return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue(); 1375 } 1376 1377 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1378 /// loop hint annotation. 1379 bool isScalarEpilogueAllowed() const { 1380 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1381 } 1382 1383 /// Returns true if all loop blocks should be masked to fold tail loop. 1384 bool foldTailByMasking() const { return FoldTailByMasking; } 1385 1386 bool blockNeedsPredication(BasicBlock *BB) { 1387 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1388 } 1389 1390 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1391 /// nodes to the chain of instructions representing the reductions. Uses a 1392 /// MapVector to ensure deterministic iteration order. 1393 using ReductionChainMap = 1394 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1395 1396 /// Return the chain of instructions representing an inloop reduction. 1397 const ReductionChainMap &getInLoopReductionChains() const { 1398 return InLoopReductionChains; 1399 } 1400 1401 /// Returns true if the Phi is part of an inloop reduction. 1402 bool isInLoopReduction(PHINode *Phi) const { 1403 return InLoopReductionChains.count(Phi); 1404 } 1405 1406 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1407 /// with factor VF. Return the cost of the instruction, including 1408 /// scalarization overhead if it's needed. 1409 unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1410 1411 /// Estimate cost of a call instruction CI if it were vectorized with factor 1412 /// VF. Return the cost of the instruction, including scalarization overhead 1413 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1414 /// scalarized - 1415 /// i.e. either vector version isn't available, or is too expensive. 
1416 unsigned getVectorCallCost(CallInst *CI, ElementCount VF, 1417 bool &NeedToScalarize); 1418 1419 /// Invalidates decisions already taken by the cost model. 1420 void invalidateCostModelingDecisions() { 1421 WideningDecisions.clear(); 1422 Uniforms.clear(); 1423 Scalars.clear(); 1424 } 1425 1426 private: 1427 unsigned NumPredStores = 0; 1428 1429 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1430 /// than zero. One is returned if vectorization should best be avoided due 1431 /// to cost. 1432 unsigned computeFeasibleMaxVF(unsigned ConstTripCount); 1433 1434 /// The vectorization cost is a combination of the cost itself and a boolean 1435 /// indicating whether any of the contributing operations will actually 1436 /// operate on 1437 /// vector values after type legalization in the backend. If this latter value 1438 /// is 1439 /// false, then all operations will be scalarized (i.e. no vectorization has 1440 /// actually taken place). 1441 using VectorizationCostTy = std::pair<unsigned, bool>; 1442 1443 /// Returns the expected execution cost. The unit of the cost does 1444 /// not matter because we use the 'cost' units to compare different 1445 /// vector widths. The cost that is returned is *not* normalized by 1446 /// the factor width. 1447 VectorizationCostTy expectedCost(ElementCount VF); 1448 1449 /// Returns the execution time cost of an instruction for a given vector 1450 /// width. Vector width of one means scalar. 1451 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1452 1453 /// The cost-computation logic from getInstructionCost which provides 1454 /// the vector type as an output parameter. 1455 unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy); 1456 1457 /// Calculate vectorization cost of memory instruction \p I. 1458 unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF); 1459 1460 /// The cost computation for scalarized memory instruction. 
1461 unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1462 1463 /// The cost computation for interleaving group of memory instructions. 1464 unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF); 1465 1466 /// The cost computation for Gather/Scatter instruction. 1467 unsigned getGatherScatterCost(Instruction *I, ElementCount VF); 1468 1469 /// The cost computation for widening instruction \p I with consecutive 1470 /// memory access. 1471 unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1472 1473 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1474 /// Load: scalar load + broadcast. 1475 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1476 /// element) 1477 unsigned getUniformMemOpCost(Instruction *I, ElementCount VF); 1478 1479 /// Estimate the overhead of scalarizing an instruction. This is a 1480 /// convenience wrapper for the type-based getScalarizationOverhead API. 1481 unsigned getScalarizationOverhead(Instruction *I, ElementCount VF); 1482 1483 /// Returns whether the instruction is a load or store and will be a emitted 1484 /// as a vector operation. 1485 bool isConsecutiveLoadOrStore(Instruction *I); 1486 1487 /// Returns true if an artificially high cost for emulated masked memrefs 1488 /// should be used. 1489 bool useEmulatedMaskMemRefHack(Instruction *I); 1490 1491 /// Map of scalar integer values to the smallest bitwidth they can be legally 1492 /// represented as. The vector equivalents of these values should be truncated 1493 /// to this type. 1494 MapVector<Instruction *, uint64_t> MinBWs; 1495 1496 /// A type representing the costs for instructions if they were to be 1497 /// scalarized rather than vectorized. The entries are Instruction-Cost 1498 /// pairs. 
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. Non-uniform scalarized instructions will be
  /// represented by VF values in the vectorized loop, each corresponding to an
  /// iteration of the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, unsigned>>;

  DecisionList WideningDecisions;

  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, ElementCount VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    if (VF.isScalar() || !I || !TheLoop->contains(I) ||
        TheLoop->isLoopInvariant(I))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);
  };

  /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   ElementCount VF) {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }

public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  // NOTE(review): presumably the function containing TheLoop; it is
  // initialized outside this excerpt — confirm at the constructor.
  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};

} // end namespace llvm

// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please, note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->empty() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}

/// Recursively gather the loops that are candidates for vectorization into
/// \p V, visiting the nest rooted at \p L.
static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If we
  // are stress testing the VPlan H-CFG construction, we collect the outermost
  // loop of every loop nest.
  if (L.empty() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}

namespace {

/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  /// The shared pass implementation; runOnFunction forwards to Impl.runImpl.
  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID),
        Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    // TLI is optional: fall back to null when the wrapper pass is absent.
    auto *TLI = TLIP ?
        &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    // Lazy per-loop access-info getter handed to the implementation.
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    // runImpl returns a result struct; the legacy pass only reports whether
    // any IR change was made.
    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI).MadeAnyChange;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addRequired<InjectTLIMappingsLegacy>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//

Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Else, broadcast will be inside
  // vector loop body.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
                     (!Instr ||
                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
  // Place the code for broadcasting invariant variables in the new preheader.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (SafeToHoist)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}

void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");
  Value *Start = II.getStartValue();

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart =
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF.Min);
  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it
  //        doesn't handle a constant vector splat.
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);

    LastInduction = cast<Instruction>(addFastMathFlag(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}

bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}

// Returns true if the induction \p IV itself, or any of its in-loop users,
// will be scalarized (per the cost model decisions above).
bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}

void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
    const InductionDescriptor &ID, const Instruction *EntryVal,
    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that casted Phi is equal to the
  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
  // re-uses the same InductionDescriptor that original IV uses but we don't
  // have to do any recording in this case - that is done when original IV is
  // processed.
  if (isa<TruncInst>(EntryVal))
    return;

  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (Casts.empty())
    return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if exist) have no uses outside the
  // induction update chain itself.
  Instruction *CastInst = *Casts.begin();
  // Lane == UINT_MAX is used as the "whole vector" sentinel (see the default
  // argument of this method's declaration).
  if (Lane < UINT_MAX)
    VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
  else
    VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
}

void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars().find(IV);
  assert(II != Legal->getInductionVars().end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant
  auto CreateStepValue = [&](const SCEV *Step) -> Value * {
    assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
           "Induction step should be loop invariant");
    if (PSE.getSE()->isSCEVable(IV->getType())) {
      SCEVExpander Exp(*PSE.getSE(), DL, "induction");
      return Exp.expandCodeFor(Step, Step->getType(),
                               LoopVectorPreHeader->getTerminator());
    }
    return cast<SCEVUnknown>(Step)->getValue();
  };

  // The scalar value to broadcast. This is derived from the canonical
  // induction variable. If a truncation type is given, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  auto CreateScalarIV = [&](Value *&Step) -> Value * {
    Value *ScalarIV = Induction;
    if (IV != OldInduction) {
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
    return ScalarIV;
  };

  // Create the vector values from the scalar IV, in the absence of creating a
  // vector IV.
  auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      assert(!VF.Scalable && "scalable vectors not yet supported.");
      Value *EntryPart = getStepVector(Broadcasted, VF.Min * Part, Step,
                                       ID.getInductionOpcode());
      VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
    }
  };

  // Now do the actual transformations, and start with creating the step value.
  Value *Step = CreateStepValue(ID.getStep());
  if (VF.isZero() || VF.isScalar()) {
    Value *ScalarIV = CreateScalarIV(Step);
    CreateSplatIV(ScalarIV, Step);
    return;
  }

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = needsScalarInduction(EntryVal);
  if (!NeedsScalarIV) {
    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
    return;
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (!shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
    Value *ScalarIV = CreateScalarIV(Step);
    // Create scalar steps that can be used by instructions we will later
    // scalarize. Note that the addition of the scalar steps will not increase
    // the number of instructions in the loop in the common case prior to
    // InstCombine. We will be trading one vector extract for each scalar step.
    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
    return;
  }

  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorised IV. Except when we tail-fold, then the splat IV feeds the
  // predicate used by the masked loads/stores.
  Value *ScalarIV = CreateScalarIV(Step);
  if (!Cost->isScalarEpilogueAllowed())
    CreateSplatIV(ScalarIV, Step);
  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}

// Lane-wise, returns Val + <StartIdx, StartIdx+1, ..., StartIdx+VLen-1> * Step
// (using \p BinOp instead of the addition for FP inductions).
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                          Instruction::BinaryOps BinOp) {
  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  int VLen = ValVTy->getNumElements();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers from zero to VF.
    for (int i = 0; i < VLen; ++i)
      Indices.push_back(ConstantInt::get(STy, StartIdx + i));

    // Add the consecutive indices to the vector value.
    Constant *Cv = ConstantVector::get(Indices);
    assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(Cv, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers from zero to VF.
  for (int i = 0; i < VLen; ++i)
    Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);

  Step = Builder.CreateVectorSplat(VLen, Step);

  // Floating point operations had to be 'fast' to enable the induction.
  FastMathFlags Flags;
  Flags.setFast();

  Value *MulOp = Builder.CreateFMul(Cv, Step);
  if (isa<Instruction>(MulOp))
    // Have to check, MulOp may be a constant
    cast<Instruction>(MulOp)->setFastMathFlags(Flags);

  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
  if (isa<Instruction>(BOp))
    cast<Instruction>(BOp)->setFastMathFlags(Flags);
  return BOp;
}

void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Instruction *EntryVal,
                                           const InductionDescriptor &ID) {
  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(VF.isVector() && "VF should be greater than one");
  assert(!VF.Scalable &&
         "the code below assumes a fixed number of elements at compile time");
  // Get the value type and ensure it and the step have the same integer type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy == Step->getType() &&
         "Val and Step should have the same type");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration. If EntryVal is uniform, we only need to generate the first
  // lane. Otherwise, we generate all VF values.
  unsigned Lanes =
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
          ? 1
          : VF.Min;
  // Compute the scalar steps and save the results in VectorLoopValueMap.
  for (unsigned Part = 0; Part < UF; ++Part) {
    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      auto *StartIdx =
          getSignedIntOrFpConstant(ScalarIVTy, VF.Min * Part + Lane);
      auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
      auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
      VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
    }
  }
}

Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  // If we have a stride that is replaced by one, do it here. Defer this for
  // the VPlan-native path until we start running Legal checks in that path.
  if (!EnableVPlanNativePath && Legal->hasStride(V))
    V = ConstantInt::get(V->getType(), 1);

  // If we have a vector mapped to this value, return it.
  if (VectorLoopValueMap.hasVectorValue(V, Part))
    return VectorLoopValueMap.getVectorValue(V, Part);

  // If the value has not been vectorized, check if it has been scalarized
  // instead. If it has been scalarized, and we actually need the value in
  // vector form, we will construct the vector values on demand.
  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});

    // If we've scalarized a value, that value should be an instruction.
    auto *I = cast<Instruction>(V);

    // If we aren't vectorizing, we can just copy the scalar map values over to
    // the vector map.
    if (VF == 1) {
      VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
      return ScalarValue;
    }

    // Get the last scalar instruction we generated for V and Part. If the
    // value is known to be uniform after vectorization, this corresponds to
    // lane zero of the Part unroll iteration. Otherwise, the last instruction
    // is the one we created for the last vector lane of the Part unroll
    // iteration.
    assert(!VF.Scalable && "scalable vectors not yet supported.");
    unsigned LastLane =
        Cost->isUniformAfterVectorization(I, VF) ? 0 : VF.Min - 1;
    auto *LastInst = cast<Instruction>(
        VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));

    // Set the insert point after the last scalarized instruction. This ensures
    // the insertelement sequence will directly follow the scalar definitions.
    auto OldIP = Builder.saveIP();
    auto NewIP = std::next(BasicBlock::iterator(LastInst));
    Builder.SetInsertPoint(&*NewIP);

    // However, if we are vectorizing, we need to construct the vector values.
    // If the value is known to be uniform after vectorization, we can just
    // broadcast the scalar value corresponding to lane zero for each unroll
    // iteration. Otherwise, we construct the vector values using insertelement
    // instructions. Since the resulting vectors are stored in
    // VectorLoopValueMap, we will only generate the insertelements once.
    Value *VectorValue = nullptr;
    if (Cost->isUniformAfterVectorization(I, VF)) {
      VectorValue = getBroadcastInstrs(ScalarValue);
      VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
    } else {
      // Initialize packing with insertelements to start from undef.
      assert(!VF.Scalable && "VF is assumed to be non scalable.");
      Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
      VectorLoopValueMap.setVectorValue(V, Part, Undef);
      for (unsigned Lane = 0; Lane < VF.Min; ++Lane)
        packScalarIntoVectorValue(V, {Part, Lane});
      VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
    }
    Builder.restoreIP(OldIP);
    return VectorValue;
  }

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  VectorLoopValueMap.setVectorValue(V, Part, B);
  return B;
}

Value *
InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
                                            const VPIteration &Instance) {
  // If the value is not an instruction contained in the loop, it should
  // already be scalar.
  if (OrigLoop->isLoopInvariant(V))
    return V;

  // NOTE(review): due to operator precedence the string literal only
  // participates in the ':' arm ("true && ..."); the asserted condition is
  // still the intended one, but the message is lost for the '?' arm.
  assert(Instance.Lane > 0
             ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
             : true && "Uniform values only have lane zero");

  // If the value from the original loop has not been vectorized, it is
  // represented by UF x VF scalar values in the new loop. Return the requested
  // scalar value.
  if (VectorLoopValueMap.hasScalarValue(V, Instance))
    return VectorLoopValueMap.getScalarValue(V, Instance);

  // If the value has not been scalarized, get its entry in VectorLoopValueMap
  // for the given unroll part. If this entry is not a vector type (i.e., the
  // vectorization factor is one), there is no need to generate an
  // extractelement instruction.
2233 auto *U = getOrCreateVectorValue(V, Instance.Part); 2234 if (!U->getType()->isVectorTy()) { 2235 assert(VF == 1 && "Value not scalarized has non-vector type"); 2236 return U; 2237 } 2238 2239 // Otherwise, the value from the original loop has been vectorized and is 2240 // represented by UF vector values. Extract and return the requested scalar 2241 // value from the appropriate vector lane. 2242 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2243 } 2244 2245 void InnerLoopVectorizer::packScalarIntoVectorValue( 2246 Value *V, const VPIteration &Instance) { 2247 assert(V != Induction && "The new induction variable should not be used."); 2248 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2249 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2250 2251 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2252 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2253 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2254 Builder.getInt32(Instance.Lane)); 2255 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2256 } 2257 2258 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2259 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2260 assert(!VF.Scalable && "Cannot reverse scalable vectors"); 2261 SmallVector<int, 8> ShuffleMask; 2262 for (unsigned i = 0; i < VF.Min; ++i) 2263 ShuffleMask.push_back(VF.Min - i - 1); 2264 2265 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2266 ShuffleMask, "reverse"); 2267 } 2268 2269 // Return whether we allow using masked interleave-groups (for dealing with 2270 // strided loads/stores that reside in predicated blocks, or for dealing 2271 // with gaps). 2272 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2273 // If an override option has been passed in for interleaved accesses, use it. 
  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
    return EnableMaskedInterleavedMemAccesses;

  return TTI.enableMaskedInterleavedAccessVectorization();
}

// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ...                     // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
//   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
//   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(
    const InterleaveGroup<Instruction> *Group, VPTransformState &State,
    VPValue *Addr, VPValue *BlockInMask) {
  Instruction *Instr = Group->getInsertPos();
  const DataLayout &DL = Instr->getModule()->getDataLayout();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  // The wide access covers VF tuples of InterleaveFactor elements each.
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  assert(!VF.Scalable &&
         "scalable vector reverse operation is not implemented");
  if (Group->isReverse())
    Index += (VF.Min - 1) * Group->getFactor();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, {Part, 0});
    setDebugLocFromInst(Builder, AddrPart);

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].

    // Preserve the inbounds flag of the original GEP (if any) on the
    // rebased pointer.
    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *UndefVec = UndefValue::get(VecTy);

  // A mask that disables the lanes of members missing from the group, so a
  // gapped group does not read past what its members cover.
  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    assert(!VF.Scalable && "scalable vectors not yet supported.");
    MaskForGaps = createBitMaskForGaps(Builder, VF.Min, *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (BlockInMask || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        Value *GroupMask = MaskForGaps;
        if (BlockInMask) {
          // Replicate the block mask across the interleave factor and combine
          // it with the gap mask (if present).
          Value *BlockInMaskPart = State.get(BlockInMask, Part);
          auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
          assert(!VF.Scalable && "scalable vectors not yet supported.");
          Value *ShuffledMask = Builder.CreateShuffleVector(
              BlockInMaskPart, Undefs,
              createReplicatedMask(InterleaveFactor, VF.Min),
              "interleaved.mask");
          GroupMask = MaskForGaps
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                MaskForGaps)
                          : ShuffledMask;
        }
        NewLoad =
            Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
                                     GroupMask, UndefVec, "wide.masked.vec");
      }
      else
        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                            Group->getAlign(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      assert(!VF.Scalable && "scalable vectors not yet supported.");
      auto StrideMask = createStrideMask(I, InterleaveFactor, VF.Min);
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], UndefVec, StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          assert(!VF.Scalable && "VF is assumed to be non scalable.");
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = reverseVector(StridedVec);

        VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
      }
    }
    return;
  }

  // The sub vector type for current instruction.
  assert(!VF.Scalable && "VF is assumed to be non scalable.");
  auto *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      // Interleaved store group doesn't allow a gap, so each index has a member
      Instruction *Member = Group->getMember(i);
      assert(Member && "Fail to get a member from an interleaved store group");

      Value *StoredVec = getOrCreateVectorValue(
          cast<StoreInst>(Member)->getValueOperand(), Part);
      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has different type, cast it to a unified type.

      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    assert(!VF.Scalable && "scalable vectors not yet supported.");
    Value *IVec = Builder.CreateShuffleVector(
        WideVec, UndefVec, createInterleaveMask(VF.Min, InterleaveFactor),
        "interleaved.vec");

    Instruction *NewStoreInstr;
    if (BlockInMask) {
      Value *BlockInMaskPart = State.get(BlockInMask, Part);
      auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
      Value *ShuffledMask = Builder.CreateShuffleVector(
          BlockInMaskPart, Undefs,
          createReplicatedMask(InterleaveFactor, VF.Min), "interleaved.mask");
      NewStoreInstr = Builder.CreateMaskedStore(
          IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    }
    else
      NewStoreInstr =
          Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());

    Group->addMetadata(NewStoreInstr);
  }
}

// Widen a single load/store according to the cost-model decision: a
// consecutive (possibly reversed) wide access, or a gather/scatter.
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
                                                     VPTransformState &State,
                                                     VPValue *Addr,
                                                     VPValue *StoredValue,
                                                     VPValue *BlockInMask) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");
  assert((!SI || StoredValue) && "No stored value provided for widened store");
  assert((!LI || !StoredValue) && "Stored value provided for widened load");

  LoopVectorizationCostModel::InstWidening Decision =
      Cost->getWideningDecision(Instr, VF);
  assert((Decision == LoopVectorizationCostModel::CM_Widen ||
          Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
          Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
         "CM decision is not to widen the memory instruction");

  Type *ScalarDataTy = getMemInstValueType(Instr);

  assert(!VF.Scalable && "scalable vectors not yet supported.");
  auto *DataTy = VectorType::get(ScalarDataTy, VF);
  const Align Alignment = getLoadStoreAlignment(Instr);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
  bool ConsecutiveStride =
      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
  bool CreateGatherScatter =
      (Decision == LoopVectorizationCostModel::CM_GatherScatter);

  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
  // gather/scatter. Otherwise Decision should have been to Scalarize.
  assert((ConsecutiveStride || CreateGatherScatter) &&
         "The instruction should be scalarized");
  (void)ConsecutiveStride;

  VectorParts BlockInMaskParts(UF);
  bool isMaskRequired = BlockInMask;
  if (isMaskRequired)
    for (unsigned Part = 0; Part < UF; ++Part)
      BlockInMaskParts[Part] = State.get(BlockInMask, Part);

  // Helper: compute the (bitcast) vector pointer for unroll part \p Part from
  // the scalar base pointer \p Ptr. For reversed accesses this also reverses
  // the corresponding block mask (side effect on BlockInMaskParts).
  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
      InBounds = gep->isInBounds();

    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
          ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.Min)));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
          ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.Min)));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
    } else {
      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
          ScalarDataTy, Ptr, Builder.getInt32(Part * VF.Min)));
      PartPtr->setIsInBounds(InBounds);
    }

    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    setDebugLocFromInst(Builder, SI);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = State.get(StoredValue, Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
        Value *VectorGep = State.get(Addr, Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = reverseVector(StoredVal);
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            BlockInMaskParts[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
      Value *VectorGep = State.get(Addr, Part);
      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      addMetadata(NewLI, LI);
    } else {
      auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(
            VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
            "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }
    VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
  }
}

// Emit a scalar clone of \p Instr for lane \p Instance, with operands
// replaced by the per-lane values recorded in \p State.
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr,
                                               VPTransformState &State) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  setDebugLocFromInst(Builder, Instr);

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
    auto *NewOp = State.get(User.getOperand(op), Instance);
    Cloned->setOperand(op, NewOp);
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  // Add the cloned scalar to the scalar map entry.
  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
    if (II->getIntrinsicID() == Intrinsic::assume)
      AC->registerAssumption(II);

  // End if-block.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}

// Create the primary induction PHI of the new loop: starts at \p Start,
// steps by \p Step each iteration, and exits when the incremented value
// equals \p End.
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(Builder, OldInst);
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  Builder.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(Builder, OldInst);

  // Create i+1 and fill the PHINode.
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}

// Compute (and cache in TripCount) the scalar trip count of \p L, expanded
// into the preheader and converted to the widest induction type.
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  // Pointer-typed trip counts are converted to the integer induction type.
  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}

// Compute (and cache in VectorTripCount) the trip count of the vectorized
// loop: the scalar trip count rounded to a multiple of VF.Min * UF (down
// normally, up when the tail is folded by masking).
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getOrCreateTripCount(L);
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

  Type *Ty = TC->getType();
  // This is where we can make the step a runtime constant.
  assert(!VF.Scalable && "scalable vectorization is not supported yet");
  Constant *Step = ConstantInt::get(Ty, VF.Min * UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.Min * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF.Min * UF - 1),
                           "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // If there is a non-reversed interleaved group that may speculatively access
  // memory out-of-bounds, we need to ensure that there will be at least one
  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
  // the trip count, we set the remainder to be equal to the step. If the step
  // does not evenly divide the trip count, no adjustment is necessary since
  // there will already be scalar iterations. Note that the minimum iterations
  // check ensures that N >= Step.
  if (VF.isVector() && Cost->requiresScalarEpilogue()) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}

// Cast vector \p V to \p DstVTy element-wise: a direct bit-or-pointer cast
// when the element types allow it, otherwise a two-step cast through an
// integer vector of matching element width.
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                                   const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  assert(isa<FixedVectorType>(DstVTy) &&
         "Vector type is assumed to be fixed width.");
  unsigned VF = DstVTy->getNumElements();
  VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  auto *VecIntTy = FixedVectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}

// Emit the minimum-iteration-count guard: branch to \p Bypass (the scalar
// loop) when the trip count is too small for even one vector iteration.
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
                                          : ICmpInst::ICMP_ULT;

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *CheckMinIters = Builder.getFalse();
  if (!Cost->foldTailByMasking()) {
    assert(!VF.Scalable && "scalable vectors not yet supported.");
    CheckMinIters = Builder.CreateICmp(
        P, Count, ConstantInt::get(Count->getType(), VF.Min * UF),
        "min.iters.check");
  }
  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
                 "vector.ph");

  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                               DT->getNode(Bypass)->getIDom()) &&
         "TC check is expected to dominate Bypass");

  // Update dominator for Bypass & LoopExit.
  DT->changeImmediateDominator(Bypass, TCCheckBlock);
  DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

  // Replace the unconditional terminator with the conditional bypass branch.
  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
  LoopBypassBlocks.push_back(TCCheckBlock);
}

// Emit runtime checks for the SCEV assumptions (predicates) made during
// analysis; branch to \p Bypass when any assumption fails at runtime.
void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  // Reuse existing vector loop preheader for SCEV checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;

  // Generate the code to check that the SCEV assumptions that we made.
  // We want the new basic block to start at the first instruction in a
  // sequence of instructions that form a check.
  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                   "scev.check");
  Value *SCEVCheck = Exp.expandCodeForPredicate(
      &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());

  // If the check folded to the constant 'false', no runtime test is needed.
  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
    if (C->isZero())
      return;

  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
           (OptForSizeBasedOnProfile &&
            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
         "Cannot SCEV check stride or overflow when optimizing for size");

  SCEVCheckBlock->setName("vector.scevcheck");
  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
                 nullptr, "vector.ph");

  // Update dominator only if this is first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
  }

  ReplaceInstWithInst(
      SCEVCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
  LoopBypassBlocks.push_back(SCEVCheckBlock);
  AddedSafetyChecks = true;
}

// Emit runtime memory checks for may-aliasing pointers; branch to \p Bypass
// (the scalar loop) when the checked ranges may overlap.
void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return;

  // Reuse existing vector loop preheader for runtime memory checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const MemCheckBlock = L->getLoopPreheader();

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  auto *LAI = Legal->getLAI();
  const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
  if (!RtPtrChecking.Need)
    return;
  Instruction *FirstCheckInst;
  Instruction *MemRuntimeCheck;
  std::tie(FirstCheckInst, MemRuntimeCheck) =
      addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
                       RtPtrChecking.getChecks(), RtPtrChecking.getSE());
  assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
                            "claimed checks are required");

  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize.");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
                                        L->getStartLoc(), L->getHeader())
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict').";
    });
  }

  MemCheckBlock->setName("vector.memcheck");
  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
                 "vector.ph");

  // Update dominator only if this is first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, MemCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
  }

  ReplaceInstWithInst(
      MemCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
  LoopBypassBlocks.push_back(MemCheckBlock);
  AddedSafetyChecks = true;

  // We currently don't use LoopVersioning for the actual loop cloning but we
  // still use it to add the noalias metadata.
  LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
                                          PSE.getSE());
  LVer->prepareNoAliasMetadata();
}

// Compute StartValue + Index * Step for induction descriptor \p ID, emitting
// IR directly with \p B (SCEV-based simplification is unsafe here; see the
// note below).
Value *InnerLoopVectorizer::emitTransformedIndex(
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
    const InductionDescriptor &ID) const {

  SCEVExpander Exp(*SE, DL, "induction");
  auto Step = ID.getStep();
  auto StartValue = ID.getStartValue();
  assert(Index->getType() == Step->getType() &&
         "Index type does not match StepValue type");

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    return B.CreateMul(X, Y);
  };

  // Get a suitable insert point for SCEV expansion. For blocks in the vector
  // loop, choose the end of the vector loop header (=LoopVectorBody), because
  // the DomTree is not kept up-to-date for additional blocks generated in the
  // vector loop.
By using the header as insertion point, we guarantee that the 3020 // expanded instructions dominate all their uses. 3021 auto GetInsertPoint = [this, &B]() { 3022 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3023 if (InsertBB != LoopVectorBody && 3024 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3025 return LoopVectorBody->getTerminator(); 3026 return &*B.GetInsertPoint(); 3027 }; 3028 switch (ID.getKind()) { 3029 case InductionDescriptor::IK_IntInduction: { 3030 assert(Index->getType() == StartValue->getType() && 3031 "Index type does not match StartValue type"); 3032 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3033 return B.CreateSub(StartValue, Index); 3034 auto *Offset = CreateMul( 3035 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3036 return CreateAdd(StartValue, Offset); 3037 } 3038 case InductionDescriptor::IK_PtrInduction: { 3039 assert(isa<SCEVConstant>(Step) && 3040 "Expected constant step for pointer induction"); 3041 return B.CreateGEP( 3042 StartValue->getType()->getPointerElementType(), StartValue, 3043 CreateMul(Index, 3044 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3045 } 3046 case InductionDescriptor::IK_FpInduction: { 3047 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3048 auto InductionBinOp = ID.getInductionBinOp(); 3049 assert(InductionBinOp && 3050 (InductionBinOp->getOpcode() == Instruction::FAdd || 3051 InductionBinOp->getOpcode() == Instruction::FSub) && 3052 "Original bin op should be defined for FP induction"); 3053 3054 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3055 3056 // Floating point operations had to be 'fast' to enable the induction. 3057 FastMathFlags Flags; 3058 Flags.setFast(); 3059 3060 Value *MulExp = B.CreateFMul(StepValue, Index); 3061 if (isa<Instruction>(MulExp)) 3062 // We have to check, the MulExp may be a constant. 
3063 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3064 3065 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3066 "induction"); 3067 if (isa<Instruction>(BOp)) 3068 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3069 3070 return BOp; 3071 } 3072 case InductionDescriptor::IK_NoInduction: 3073 return nullptr; 3074 } 3075 llvm_unreachable("invalid enum"); 3076 } 3077 3078 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3079 LoopScalarBody = OrigLoop->getHeader(); 3080 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3081 LoopExitBlock = OrigLoop->getExitBlock(); 3082 assert(LoopExitBlock && "Must have an exit block"); 3083 assert(LoopVectorPreHeader && "Invalid loop structure"); 3084 3085 LoopMiddleBlock = 3086 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3087 LI, nullptr, Twine(Prefix) + "middle.block"); 3088 LoopScalarPreHeader = 3089 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3090 nullptr, Twine(Prefix) + "scalar.ph"); 3091 // We intentionally don't let SplitBlock to update LoopInfo since 3092 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3093 // LoopVectorBody is explicitly added to the correct place few lines later. 3094 LoopVectorBody = 3095 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3096 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3097 3098 // Update dominator for loop exit. 3099 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3100 3101 // Create and register the new vector loop. 3102 Loop *Lp = LI->AllocateLoop(); 3103 Loop *ParentLoop = OrigLoop->getParentLoop(); 3104 3105 // Insert the new loop into the loop nest and register the new basic blocks 3106 // before calling any utilities such as SCEV that require valid LoopInfo. 
3107 if (ParentLoop) { 3108 ParentLoop->addChildLoop(Lp); 3109 } else { 3110 LI->addTopLevelLoop(Lp); 3111 } 3112 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3113 return Lp; 3114 } 3115 3116 void InnerLoopVectorizer::createInductionResumeValues(Loop *L, 3117 Value *VectorTripCount) { 3118 assert(VectorTripCount && L && "Expected valid arguments"); 3119 // We are going to resume the execution of the scalar loop. 3120 // Go over all of the induction variables that we found and fix the 3121 // PHIs that are left in the scalar version of the loop. 3122 // The starting values of PHI nodes depend on the counter of the last 3123 // iteration in the vectorized loop. 3124 // If we come from a bypass edge then we need to start from the original 3125 // start value. 3126 for (auto &InductionEntry : Legal->getInductionVars()) { 3127 PHINode *OrigPhi = InductionEntry.first; 3128 InductionDescriptor II = InductionEntry.second; 3129 3130 // Create phi nodes to merge from the backedge-taken check block. 3131 PHINode *BCResumeVal = 3132 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3133 LoopScalarPreHeader->getTerminator()); 3134 // Copy original phi DL over to the new one. 3135 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3136 Value *&EndValue = IVEndValues[OrigPhi]; 3137 if (OrigPhi == OldInduction) { 3138 // We know what the end value is. 
3139 EndValue = VectorTripCount; 3140 } else { 3141 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3142 Type *StepType = II.getStep()->getType(); 3143 Instruction::CastOps CastOp = 3144 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3145 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3146 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3147 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3148 EndValue->setName("ind.end"); 3149 } 3150 3151 // The new PHI merges the original incoming value, in case of a bypass, 3152 // or the value at the end of the vectorized loop. 3153 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3154 3155 // Fix the scalar body counter (PHI node). 3156 // The old induction's phi node in the scalar body needs the truncated 3157 // value. 3158 for (BasicBlock *BB : LoopBypassBlocks) 3159 BCResumeVal->addIncoming(II.getStartValue(), BB); 3160 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3161 } 3162 } 3163 3164 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3165 MDNode *OrigLoopID) { 3166 assert(L && "Expected valid loop."); 3167 3168 // The trip counts should be cached by now. 3169 Value *Count = getOrCreateTripCount(L); 3170 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3171 3172 // We need the OrigLoop (scalar loop part) latch terminator to help 3173 // produce correct debug info for the middle block BB instructions. 3174 // The legality check stage guarantees that the loop will have a single 3175 // latch. 3176 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3177 "Scalar loop latch terminator isn't a branch"); 3178 BranchInst *ScalarLatchBr = 3179 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3180 3181 // Add a check in the middle block to see if we have completed 3182 // all of the iterations in the first vector loop. 
3183 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3184 // If tail is to be folded, we know we don't need to run the remainder. 3185 Value *CmpN = Builder.getTrue(); 3186 if (!Cost->foldTailByMasking()) { 3187 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3188 VectorTripCount, "cmp.n", 3189 LoopMiddleBlock->getTerminator()); 3190 3191 // Here we use the same DebugLoc as the scalar loop latch branch instead 3192 // of the corresponding compare because they may have ended up with 3193 // different line numbers and we want to avoid awkward line stepping while 3194 // debugging. Eg. if the compare has got a line number inside the loop. 3195 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3196 } 3197 3198 BranchInst *BrInst = 3199 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3200 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3201 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3202 3203 // Get ready to start creating new instructions into the vectorized body. 3204 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3205 "Inconsistent vector loop preheader"); 3206 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3207 3208 Optional<MDNode *> VectorizedLoopID = 3209 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3210 LLVMLoopVectorizeFollowupVectorized}); 3211 if (VectorizedLoopID.hasValue()) { 3212 L->setLoopID(VectorizedLoopID.getValue()); 3213 3214 // Do not setAlreadyVectorized if loop attributes have been defined 3215 // explicitly. 3216 return LoopVectorPreHeader; 3217 } 3218 3219 // Keep all loop hints from the original loop on the vector loop (we'll 3220 // replace the vectorizer-specific hints below). 
3221 if (MDNode *LID = OrigLoop->getLoopID()) 3222 L->setLoopID(LID); 3223 3224 LoopVectorizeHints Hints(L, true, *ORE); 3225 Hints.setAlreadyVectorized(); 3226 3227 #ifdef EXPENSIVE_CHECKS 3228 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3229 LI->verify(*DT); 3230 #endif 3231 3232 return LoopVectorPreHeader; 3233 } 3234 3235 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3236 /* 3237 In this function we generate a new loop. The new loop will contain 3238 the vectorized instructions while the old loop will continue to run the 3239 scalar remainder. 3240 3241 [ ] <-- loop iteration number check. 3242 / | 3243 / v 3244 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3245 | / | 3246 | / v 3247 || [ ] <-- vector pre header. 3248 |/ | 3249 | v 3250 | [ ] \ 3251 | [ ]_| <-- vector loop. 3252 | | 3253 | v 3254 | -[ ] <--- middle-block. 3255 | / | 3256 | / v 3257 -|- >[ ] <--- new preheader. 3258 | | 3259 | v 3260 | [ ] \ 3261 | [ ]_| <-- old scalar loop to handle remainder. 3262 \ | 3263 \ v 3264 >[ ] <-- exit block. 3265 ... 3266 */ 3267 3268 // Get the metadata of the original loop before it gets modified. 3269 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3270 3271 // Create an empty vector loop, and prepare basic blocks for the runtime 3272 // checks. 3273 Loop *Lp = createVectorLoopSkeleton(""); 3274 3275 // Now, compare the new count to zero. If it is zero skip the vector loop and 3276 // jump to the scalar loop. This check also covers the case where the 3277 // backedge-taken count is uint##_max: adding one to it will overflow leading 3278 // to an incorrect trip count of zero. In this (rare) case we will also jump 3279 // to the scalar loop. 3280 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3281 3282 // Generate the code to check any assumptions that we've made for SCEV 3283 // expressions. 
3284 emitSCEVChecks(Lp, LoopScalarPreHeader); 3285 3286 // Generate the code that checks in runtime if arrays overlap. We put the 3287 // checks into a separate block to make the more common case of few elements 3288 // faster. 3289 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3290 3291 // Some loops have a single integer induction variable, while other loops 3292 // don't. One example is c++ iterators that often have multiple pointer 3293 // induction variables. In the code below we also support a case where we 3294 // don't have a single induction variable. 3295 // 3296 // We try to obtain an induction variable from the original loop as hard 3297 // as possible. However if we don't find one that: 3298 // - is an integer 3299 // - counts from zero, stepping by one 3300 // - is the size of the widest induction variable type 3301 // then we create a new one. 3302 OldInduction = Legal->getPrimaryInduction(); 3303 Type *IdxTy = Legal->getWidestInductionType(); 3304 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3305 // The loop step is equal to the vectorization factor (num of SIMD elements) 3306 // times the unroll factor (num of SIMD instructions). 3307 assert(!VF.Scalable && "scalable vectors not yet supported."); 3308 Constant *Step = ConstantInt::get(IdxTy, VF.Min * UF); 3309 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3310 Induction = 3311 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3312 getDebugLocFromInstOrOperands(OldInduction)); 3313 3314 // Emit phis for the new starting index of the scalar loop. 3315 createInductionResumeValues(Lp, CountRoundDown); 3316 3317 return completeLoopSkeleton(Lp, OrigLoopID); 3318 } 3319 3320 // Fix up external users of the induction variable. At this point, we are 3321 // in LCSSA form, with all external PHIs that use the IV having one input value, 3322 // coming from the remainder loop. We need those PHIs to also have a correct 3323 // value for the IV when arriving directly from the middle block. 
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getExitBlock() && "Expected a single exit block");

  // Maps each out-of-loop LCSSA PHI to the value it must receive along the
  // edge from the middle block.
  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value need to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      // Convert CRD - 1 to the step's type: int-to-FP for FP inductions,
      // sext/trunc otherwise.
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}

namespace {

// DenseMap traits that hash/compare instructions by their operation and
// operands (rather than by pointer identity), so structurally identical
// instructions collide. Used by cse() below; restricted via canHandle() to
// the instruction kinds the vectorizer itself emits redundantly.
struct CSEDenseMapInfo {
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    // Sentinel keys must only compare equal to themselves; never dereference
    // them.
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace

/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple cse: a single forward pass over the block, replacing each
  // handled instruction with the first structurally-identical one seen.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    // Advance before possibly erasing `In` so the iterator stays valid.
    Instruction *In = &*I++;

    if (!CSEDenseMapInfo::canHandle(In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(In)) {
      In->replaceAllUsesWith(V);
      In->eraseFromParent();
      continue;
    }

    CSEMap[In] = In;
  }
}

// Returns the cost of the call when vectorized with factor VF, which is the
// cheaper of (a) scalarizing the call VF times plus packing/unpacking overhead
// and (b) calling a vector library variant if one exists. Sets NeedToScalarize
// to true iff option (a) was the winner.
unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                                       ElementCount VF,
                                                       bool &NeedToScalarize) {
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  Function *F = CI->getCalledFunction();
  Type *ScalarRetTy = CI->getType();
  SmallVector<Type *, 4> Tys, ScalarTys;
  for (auto &ArgOp : CI->arg_operands())
    ScalarTys.push_back(ArgOp->getType());

  // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from there,
  // execute VF scalar calls, and then gather the result into the vector return
  // value.
  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
                                                 TTI::TCK_RecipThroughput);
  if (VF.isScalar())
    return ScalarCallCost;

  // Compute corresponding vector type for return value and arguments.
  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
  for (Type *ScalarTy : ScalarTys)
    Tys.push_back(ToVectorTy(ScalarTy, VF));

  // Compute costs of unpacking argument values for the scalar calls and
  // packing the return values to a vector.
  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);

  unsigned Cost = ScalarCallCost * VF.Min + ScalarizationCost;

  // If we can't emit a vector call for this function, then the currently found
  // cost is the cost we need to return.
  NeedToScalarize = true;
  VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

  if (!TLI || CI->isNoBuiltin() || !VecFunc)
    return Cost;

  // If the corresponding vector cost is cheaper, return its cost.
  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
                                                 TTI::TCK_RecipThroughput);
  if (VectorCallCost < Cost) {
    NeedToScalarize = false;
    return VectorCallCost;
  }
  return Cost;
}

// Returns the target's cost for the intrinsic corresponding to CI, widened to
// VF. CI must map to a vectorizable intrinsic (asserted).
unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                            ElementCount VF) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");

  IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
  return TTI.getIntrinsicInstrCost(CostAttrs,
                                   TargetTransformInfo::TCK_RecipThroughput);
}

// Of two integer vector types, returns the one with the narrower element type.
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}

// Of two integer vector types, returns the one with the wider element type.
static Type *largestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}

void InnerLoopVectorizer::truncateToMinimalBitwidths() {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
  SmallPtrSet<Value *, 4> Erased;
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
        continue;
      Type *OriginalTy = I->getType();
      // KV.second is the minimal bit width computed by the cost model.
      Type *ScalarTruncatedTy =
          IntegerType::get(OriginalTy->getContext(), KV.second);
      auto *TruncatedTy = FixedVectorType::get(
          ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements());
      if (TruncatedTy == OriginalTy)
        continue;

      IRBuilder<> B(cast<Instruction>(I));
      auto ShrinkOperand = [&](Value *V) -> Value * {
        // Peel off a zext whose source already has the truncated type instead
        // of stacking a trunc on top of it.
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));

        // Any wrapping introduced by shrinking this operation shouldn't be
        // considered undefined behavior. So, we can't unconditionally copy
        // arithmetic wrapping flags to NewI.
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default:
          llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        }
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        // Shuffle operands may have different element counts; shrink each to
        // its own truncated vector type.
        auto Elements0 =
            cast<VectorType>(SI->getOperand(0)->getType())->getNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            SI->getOperand(0),
            FixedVectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 =
            cast<VectorType>(SI->getOperand(1)->getType())->getNumElements();
        auto *O1 = B.CreateZExtOrTrunc(
            SI->getOperand(1),
            FixedVectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements =
            cast<VectorType>(IE->getOperand(0)->getType())->getNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0),
            FixedVectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements =
            cast<VectorType>(EE->getOperand(0)->getType())->getNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0),
            FixedVectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        // If we don't know what to do, be conservative and don't do anything.
        continue;
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
      }
    }
  }
}

// Post-widening fixups: bit-width shrinking, recurrence/reduction PHI wiring,
// external IV users, LCSSA PHIs, predicated-instruction sinking, CSE, and
// profile-weight redistribution between the vector and remainder loops.
void InnerLoopVectorizer::fixVectorizedLoop() {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF.isVector())
    truncateToMinimalBitwidths();

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs();
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs();

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // Fix-up external users of the induction variables.
  for (auto &Entry : Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  fixLCSSAPHIs();
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that original loop
  // represented by LoopScalarBody becomes remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the weight
  // to the vector loop, optimistically.
  assert(!VF.Scalable &&
         "cannot use scalable ElementCount to determine unroll factor");
  setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
                               LI->getLoopFor(LoopVectorBody),
                               LI->getLoopFor(LoopScalarBody), VF.Min * UF);
}

void InnerLoopVectorizer::fixCrossIterationPHIs() {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
    // Handle first-order recurrences and reductions that need to be fixed.
    if (Legal->isFirstOrderRecurrence(&Phi))
      fixFirstOrderRecurrence(&Phi);
    else if (Legal->isReductionVariable(&Phi))
      fixReduction(&Phi);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a".
For this loop, the shorthand 3719 // scalar IR looks like: 3720 // 3721 // scalar.ph: 3722 // s_init = a[-1] 3723 // br scalar.body 3724 // 3725 // scalar.body: 3726 // i = phi [0, scalar.ph], [i+1, scalar.body] 3727 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3728 // s2 = a[i] 3729 // b[i] = s2 - s1 3730 // br cond, scalar.body, ... 3731 // 3732 // In this example, s1 is a recurrence because it's value depends on the 3733 // previous iteration. In the first phase of vectorization, we created a 3734 // temporary value for s1. We now complete the vectorization and produce the 3735 // shorthand vector IR shown below (for VF = 4, UF = 1). 3736 // 3737 // vector.ph: 3738 // v_init = vector(..., ..., ..., a[-1]) 3739 // br vector.body 3740 // 3741 // vector.body 3742 // i = phi [0, vector.ph], [i+4, vector.body] 3743 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3744 // v2 = a[i, i+1, i+2, i+3]; 3745 // v3 = vector(v1(3), v2(0, 1, 2)) 3746 // b[i, i+1, i+2, i+3] = v2 - v3 3747 // br cond, vector.body, middle.block 3748 // 3749 // middle.block: 3750 // x = v2(3) 3751 // br scalar.ph 3752 // 3753 // scalar.ph: 3754 // s_init = phi [x, middle.block], [a[-1], otherwise] 3755 // br scalar.body 3756 // 3757 // After execution completes the vector loop, we extract the next value of 3758 // the recurrence (x) to use as the initial value in the scalar loop. 3759 3760 // Get the original loop preheader and single loop latch. 3761 auto *Preheader = OrigLoop->getLoopPreheader(); 3762 auto *Latch = OrigLoop->getLoopLatch(); 3763 3764 // Get the initial and previous values of the scalar recurrence. 3765 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3766 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3767 3768 // Create a vector from the initial value. 
3769 auto *VectorInit = ScalarInit; 3770 if (VF.isVector()) { 3771 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3772 assert(!VF.Scalable && "VF is assumed to be non scalable."); 3773 VectorInit = Builder.CreateInsertElement( 3774 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3775 Builder.getInt32(VF.Min - 1), "vector.recur.init"); 3776 } 3777 3778 // We constructed a temporary phi node in the first phase of vectorization. 3779 // This phi node will eventually be deleted. 3780 Builder.SetInsertPoint( 3781 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3782 3783 // Create a phi node for the new recurrence. The current value will either be 3784 // the initial value inserted into a vector or loop-varying vector value. 3785 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3786 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3787 3788 // Get the vectorized previous value of the last part UF - 1. It appears last 3789 // among all unrolled iterations, due to the order of their construction. 3790 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3791 3792 // Find and set the insertion point after the previous value if it is an 3793 // instruction. 3794 BasicBlock::iterator InsertPt; 3795 // Note that the previous value may have been constant-folded so it is not 3796 // guaranteed to be an instruction in the vector loop. 3797 // FIXME: Loop invariant values do not form recurrences. We should deal with 3798 // them earlier. 3799 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3800 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3801 else { 3802 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3803 if (isa<PHINode>(PreviousLastPart)) 3804 // If the previous value is a phi node, we should insert after all the phi 3805 // nodes in the block containing the PHI to avoid breaking basic block 3806 // verification. 
Note that the basic block may be different to 3807 // LoopVectorBody, in case we predicate the loop. 3808 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3809 else 3810 InsertPt = ++PreviousInst->getIterator(); 3811 } 3812 Builder.SetInsertPoint(&*InsertPt); 3813 3814 // We will construct a vector for the recurrence by combining the values for 3815 // the current and previous iterations. This is the required shuffle mask. 3816 assert(!VF.Scalable); 3817 SmallVector<int, 8> ShuffleMask(VF.Min); 3818 ShuffleMask[0] = VF.Min - 1; 3819 for (unsigned I = 1; I < VF.Min; ++I) 3820 ShuffleMask[I] = I + VF.Min - 1; 3821 3822 // The vector from which to take the initial value for the current iteration 3823 // (actual or unrolled). Initially, this is the vector phi node. 3824 Value *Incoming = VecPhi; 3825 3826 // Shuffle the current and previous vector and update the vector parts. 3827 for (unsigned Part = 0; Part < UF; ++Part) { 3828 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3829 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3830 auto *Shuffle = 3831 VF.isVector() 3832 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 3833 : Incoming; 3834 PhiPart->replaceAllUsesWith(Shuffle); 3835 cast<Instruction>(PhiPart)->eraseFromParent(); 3836 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3837 Incoming = PreviousPart; 3838 } 3839 3840 // Fix the latch value of the new recurrence in the vector loop. 3841 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3842 3843 // Extract the last vector element in the middle block. This will be the 3844 // initial value for the recurrence when jumping to the scalar loop. 
3845 auto *ExtractForScalar = Incoming; 3846 if (VF.isVector()) { 3847 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3848 ExtractForScalar = Builder.CreateExtractElement( 3849 ExtractForScalar, Builder.getInt32(VF.Min - 1), "vector.recur.extract"); 3850 } 3851 // Extract the second last element in the middle block if the 3852 // Phi is used outside the loop. We need to extract the phi itself 3853 // and not the last element (the phi update in the current iteration). This 3854 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3855 // when the scalar loop is not run at all. 3856 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3857 if (VF.isVector()) 3858 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3859 Incoming, Builder.getInt32(VF.Min - 2), "vector.recur.extract.for.phi"); 3860 // When loop is unrolled without vectorizing, initialize 3861 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3862 // `Incoming`. This is analogous to the vectorized case above: extracting the 3863 // second last element when VF > 1. 3864 else if (UF > 1) 3865 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3866 3867 // Fix the initial value of the original recurrence in the scalar loop. 3868 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3869 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3870 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3871 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3872 Start->addIncoming(Incoming, BB); 3873 } 3874 3875 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3876 Phi->setName("scalar.recur"); 3877 3878 // Finally, fix users of the recurrence outside the loop. The users will need 3879 // either the last value of the scalar recurrence or the last value of the 3880 // vector recurrence we extracted in the middle block. 
// Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getIncomingValue(0) == Phi) {
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
    }
  }
}

// Fix up a vectorized reduction: wire the vector-loop phi(s), combine the
// unrolled parts, emit the target reduction in the middle block, and patch
// the scalar loop / LCSSA exit phis so both the remainder loop and any
// out-of-loop users see the reduced value.
void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
  Constant *Zero = Builder.getInt32(0);

  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(Phi) &&
         "Unable to find the reduction variable");
  RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];

  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
      RdxDesc.getMinMaxRecurrenceKind();
  setDebugLocFromInst(Builder, ReductionStartValue);
  // In-loop reductions keep a scalar phi and reduce inside the loop body
  // (via a Reduction recipe), so several vector-only steps below are skipped.
  bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);

  // We need to generate a reduction vector from the incoming scalar.
  // To do so, we need to generate the 'identity' vector and override
  // one of the elements with the incoming scalar reduction. We need
  // to do it in the vector-loop preheader.
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();

  // Find the reduction identity variable. Zero for addition, or, xor,
  // one for multiplication, -1 for And.
  Value *Identity;
  Value *VectorStart;
  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
      RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
    if (VF == 1 || IsInLoopReductionPhi) {
      VectorStart = Identity = ReductionStartValue;
    } else {
      VectorStart = Identity =
          Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
    }
  } else {
    // Handle other reduction kinds:
    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
        RK, VecTy->getScalarType());
    if (VF == 1 || IsInLoopReductionPhi) {
      Identity = Iden;
      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart = ReductionStartValue;
    } else {
      Identity = ConstantVector::getSplat(VF, Iden);

      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart =
          Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
    }
  }

  // Wrap flags are in general invalid after vectorization, clear them.
  clearReductionWrapFlags(RdxDesc);

  // Fix the vector-loop phi.

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  BasicBlock *Latch = OrigLoop->getLoopLatch();
  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);

  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
    Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? VectorStart : Identity;
    cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
    cast<PHINode>(VecRdxPhi)
        ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
  }

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(Builder, LoopExitInst);

  // If tail is folded by masking, the vector value to leave the loop should be
  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
  // instead of the former.
  if (Cost->foldTailByMasking()) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VecLoopExitInst =
          VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
      Value *Sel = nullptr;
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert(!Sel && "Reduction exit feeding two selects");
          Sel = U;
        } else
          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
      }
      assert(Sel && "Reduction exit feeds no select");
      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);

      // If the target can create a predicated operator for the reduction at no
      // extra cost in the loop (for example a predicated vadd), it can be
      // cheaper for the select to remain in the loop than be sunk out of it,
      // and so use the select value for the phi instead of the old
      // LoopExitValue.
      RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
      if (PreferPredicatedReductionSelect ||
          TTI->preferPredicatedReductionSelect(
              RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()),
              Phi->getType(), TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
        VecRdxPhi->setIncomingValueForBlock(
            LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
      }
    }
  }

  // If the vector reduction can be performed in a smaller type, we truncate
  // then extend the loop exit value to enable InstCombine to evaluate the
  // entire expression in the smaller type.
  if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
    assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
    assert(!VF.Scalable && "scalable vectors not yet supported.");
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
    Builder.SetInsertPoint(
        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                        : Builder.CreateZExt(Trunc, VecTy);
      // Rewrite all users of the wide exit value (except the trunc we just
      // made) to use the trunc+extend pair instead.
      for (Value::user_iterator UI = RdxParts[Part]->user_begin();
           UI != RdxParts[Part]->user_end();)
        if (*UI != Trunc) {
          (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
          RdxParts[Part] = Extnd;
        } else {
          ++UI;
        }
    }
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
    }
  }

  // Reduce all of the unrolled parts into a single vector.
  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);

  // The middle block terminator has already been assigned a DebugLoc here (the
  // OrigLoop's single latch terminator). We want the whole middle block to
  // appear to execute on this line because: (a) it is all compiler generated,
  // (b) these instructions are always executed after evaluating the latch
  // conditional branch, and (c) other passes may add new predecessors which
  // terminate on this line. This is the easiest way to ensure we don't
  // accidentally cause an extra step back into the loop while debugging.
  setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
  for (unsigned Part = 1; Part < UF; ++Part) {
    Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
    if (Op != Instruction::ICmp && Op != Instruction::FCmp)
      // Floating point operations had to be 'fast' to enable the reduction.
      ReducedPartRdx = addFastMathFlag(
          Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
                              ReducedPartRdx, "bin.rdx"),
          RdxDesc.getFastMathFlags());
    else
      ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
                                      RdxPart);
  }

  // Create the reduction after the loop. Note that inloop reductions create the
  // target reduction in the loop using a Reduction recipe.
  if (VF.isVector() && !IsInLoopReductionPhi) {
    bool NoNaN = Legal->hasFunNoNaNAttr();
    ReducedPartRdx =
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
    // If the reduction can be performed in a smaller type, we need to extend
    // the reduction to the wider type before we branch to the original loop.
    if (Phi->getType() != RdxDesc.getRecurrenceType())
      ReducedPartRdx =
          RdxDesc.isSigned()
              ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
              : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
  }

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());
  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.
  // We know that the loop is in LCSSA form. We need to update the
  // PHI nodes in the exit blocks.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    // All PHINodes need to have a single entry edge, or two if
    // we already fixed them.
    assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

    // We found a reduction value exit-PHI. Update it with the
    // incoming bypass edge.
    if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
      LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
  } // end of the LCSSA phi scan.

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block.
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}

// Drop nsw/nuw (poison-generating) flags from the vectorized instructions
// feeding an integer add/mul reduction: reassociating the reduction across
// lanes/parts can introduce intermediate values that wrap even when the
// scalar computation did not, so the flags are no longer valid.
void InnerLoopVectorizer::clearReductionWrapFlags(
    RecurrenceDescriptor &RdxDesc) {
  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
  // Only integer add/mul reductions carry wrap flags worth clearing.
  if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
      RK != RecurrenceDescriptor::RK_IntegerMult)
    return;

  Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
  assert(LoopExitInstr && "null loop exit instruction");
  // Worklist walk over the in-loop users reachable from the exit instruction.
  SmallVector<Instruction *, 8> Worklist;
  SmallPtrSet<Instruction *, 8> Visited;
  Worklist.push_back(LoopExitInstr);
  Visited.insert(LoopExitInstr);

  while (!Worklist.empty()) {
    Instruction *Cur = Worklist.pop_back_val();
    if (isa<OverflowingBinaryOperator>(Cur))
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *V = getOrCreateVectorValue(Cur, Part);
        cast<Instruction>(V)->dropPoisonGeneratingFlags();
      }

    for (User *U : Cur->users()) {
      Instruction *UI = cast<Instruction>(U);
      // Only follow users inside the loop, except from the exit instruction
      // itself whose users are followed unconditionally.
      if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
          Visited.insert(UI).second)
        Worklist.push_back(UI);
    }
  }
}

// Give each single-entry LCSSA phi in the exit block a second incoming value
// from the middle block, so the exit sees the value computed by the vector
// loop when it is taken from there.
void InnerLoopVectorizer::fixLCSSAPHIs() {
  assert(!VF.Scalable && "the code below assumes fixed width vectors");
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getNumIncomingValues() == 1) {
      auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values will have only one value.
      unsigned LastLane = 0;
      if (isa<Instruction>(IncomingValue))
        LastLane = Cost->isUniformAfterVectorization(
                       cast<Instruction>(IncomingValue), VF)
                       ? 0
                       : VF.Min - 1;
      // Can be a loop invariant incoming value or the last scalar value to be
      // extracted from the vectorized loop.
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
      Value *lastIncomingValue =
          getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
      LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
    }
  }
}

// Iteratively sink scalarized operands of a predicated (conditionally
// executed) instruction into the block created for it, so the sunk work only
// executes when the predicate holds.
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is already in the
      // predicated block, is not in the loop, or may have side effects.
      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
          !VectorLoop->contains(I) || I->mayHaveSideEffects())
        continue;

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}

// Fill in the incoming values of the vector phis created (operand-less) for
// non-induction PHIs on the VPlan-native path, mapping each scalar
// predecessor to its vector counterpart.
void InnerLoopVectorizer::fixNonInductionPHIs() {
  for (PHINode *OrigPhi : OrigPHIsToFix) {
    PHINode *NewPhi =
        cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
    unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();

    SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
        predecessors(OrigPhi->getParent()));
    SmallVector<BasicBlock *, 2> VectorBBPredecessors(
        predecessors(NewPhi->getParent()));
    assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
           "Scalar and Vector BB should have the same number of predecessors");

    // The insertion point in Builder may be invalidated by the time we get
    // here.
// Force the Builder insertion point to something valid so that we do
    // not run into issues during insertion point restore in
    // getOrCreateVectorValue calls below.
    Builder.SetInsertPoint(NewPhi);

    // The predecessor order is preserved and we can rely on mapping between
    // scalar and vector block predecessors.
    for (unsigned i = 0; i < NumIncomingValues; ++i) {
      BasicBlock *NewPredBB = VectorBBPredecessors[i];

      // When looking up the new scalar/vector values to fix up, use incoming
      // values from original phi.
      Value *ScIncV =
          OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);

      // Scalar incoming value may need a broadcast
      Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
      NewPhi->addIncoming(NewIncV, NewPredBB);
    }
  }
}

// Widen a GEP: produce one (vector or scalar) GEP per unroll part, using
// vector-typed operands only for loop-varying values.
void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
                                   unsigned UF, ElementCount VF,
                                   bool IsPtrLoopInvariant,
                                   SmallBitVector &IsIndexLoopInvariant,
                                   VPTransformState &State) {
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    auto *Clone = Builder.Insert(GEP->clone());
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
      VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
      addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
                                     : State.get(Operands.getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
        VPValue *Operand = Operands.getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, {0, 0}));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP =
          GEP->isInBounds()
              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
                                          Indices)
              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
      assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
      addMetadata(NewGEP, GEP);
    }
  }
}

// Widen a header phi node: VPlan-native uniform PHIs get an operand-less
// vector phi; reduction/first-order-recurrence phis get stage-one empty
// vector phis (wired up later by fixReduction/fixFirstOrderRecurrence);
// remaining phis must be pointer inductions, handled here.
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                              ElementCount VF) {
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  PHINode *P = cast<PHINode>(PN);
  if (EnableVPlanNativePath) {
    // Currently we enter here in the VPlan-native path for non-induction
    // PHIs where all control flow is uniform. We simply widen these PHIs.
    // Create a vector phi with no operands - the vector phi operands will be
    // set at the end of vector code generation.
    Type *VecTy =
        (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
    OrigPHIsToFix.push_back(P);

    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      // This is phase one of vectorizing PHIs.
      // In-loop reductions keep a scalar phi even when vectorizing.
      bool ScalarPHI =
          (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
      Type *VecTy =
          ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
      Value *EntryPart = PHINode::Create(
          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
      VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
    }
    return;
  }

  setDebugLocFromInst(Builder, P);

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars().count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars().lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  switch (II.getKind()) {
  case InductionDescriptor::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case InductionDescriptor::IK_IntInduction:
  case InductionDescriptor::IK_FpInduction:
    llvm_unreachable("Integer/fp induction is handled elsewhere.");
  case InductionDescriptor::IK_PtrInduction: {
    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");

    if (Cost->isScalarAfterVectorization(P, VF)) {
      // This is the normalized GEP that starts counting at zero.
      Value *PtrInd =
          Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
      // Determine the number of scalars we need to generate for each unroll
      // iteration. If the instruction is uniform, we only need to generate the
      // first lane. Otherwise, we generate all VF values.
      unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.Min;
      for (unsigned Part = 0; Part < UF; ++Part) {
        for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
          Constant *Idx =
              ConstantInt::get(PtrInd->getType(), Lane + Part * VF.Min);
          Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
          Value *SclrGep =
              emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
          SclrGep->setName("next.gep");
          VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
        }
      }
      return;
    }
    assert(isa<SCEVConstant>(II.getStep()) &&
           "Induction step not a SCEV constant!");
    Type *PhiType = II.getStep()->getType();

    // Build a pointer phi
    Value *ScalarStartValue = II.getStartValue();
    Type *ScStValueType = ScalarStartValue->getType();
    PHINode *NewPointerPhi =
        PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
    NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);

    // A pointer induction, performed by using a gep
    BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
    Instruction *InductionLoc = LoopLatch->getTerminator();
    const SCEV *ScalarStep = II.getStep();
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
    Value *ScalarStepValue =
        Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
    // Advance the phi by step * VF.Min * UF each vector iteration.
    Value *InductionGEP = GetElementPtrInst::Create(
        ScStValueType->getPointerElementType(), NewPointerPhi,
        Builder.CreateMul(ScalarStepValue,
                          ConstantInt::get(PhiType, VF.Min * UF)),
        "ptr.ind", InductionLoc);
    NewPointerPhi->addIncoming(InductionGEP, LoopLatch);

    // Create UF many actual address geps that use the pointer
    // phi as base and a vectorized version of the step value
    // (<step*0, ..., step*N>) as offset.
    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<Constant *, 8> Indices;
      // Create a vector of consecutive numbers from zero to VF.
for (unsigned i = 0; i < VF.Min; ++i)
        Indices.push_back(ConstantInt::get(PhiType, i + Part * VF.Min));
      Constant *StartOffset = ConstantVector::get(Indices);

      // Per-part address vector: phi + <step*(Part*VF+0), ..., step*(Part*VF+VF-1)>.
      Value *GEP = Builder.CreateGEP(
          ScStValueType->getPointerElementType(), NewPointerPhi,
          Builder.CreateMul(StartOffset,
                            Builder.CreateVectorSplat(VF.Min, ScalarStepValue),
                            "vector.gep"));
      VectorLoopValueMap.setVectorValue(P, Part, GEP);
    }
  }
  }
}

/// A helper function for checking whether an integer division-related
/// instruction may divide by zero (in which case it must be predicated if
/// executed conditionally in the scalar code).
/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
///       Non-zero divisors that are non compile-time constants will not be
///       converted into multiplication, so we will still end up scalarizing
///       the division, but can do so w/o predication.
static bool mayDivideByZero(Instruction &I) {
  assert((I.getOpcode() == Instruction::UDiv ||
          I.getOpcode() == Instruction::SDiv ||
          I.getOpcode() == Instruction::URem ||
          I.getOpcode() == Instruction::SRem) &&
         "Unexpected instruction");
  Value *Divisor = I.getOperand(1);
  auto *CInt = dyn_cast<ConstantInt>(Divisor);
  // Conservative: any non-constant divisor, or a literal zero, may trap.
  return !CInt || CInt->isZero();
}

// Widen a simple instruction (unop/binop/compare/cast) by emitting one vector
// clone per unroll part, taking already-widened operands from State.
void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
                                           VPTransformState &State) {
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  switch (I.getOpcode()) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    setDebugLocFromInst(Builder, &I);

    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : User.operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      // Carry over nsw/nuw/fast-math flags from the scalar instruction.
      if (auto *VecOp = dyn_cast<Instruction>(V))
        VecOp->copyIRFlags(&I);

      // Use this vector value for all users of the original instruction.
      VectorLoopValueMap.setVectorValue(&I, Part, V);
      addMetadata(V, &I);
    }

    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    setDebugLocFromInst(Builder, Cmp);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = State.get(User.getOperand(0), Part);
      Value *B = State.get(User.getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      VectorLoopValueMap.setVectorValue(&I, Part, C);
      addMetadata(C, &I);
    }

    break;
  }

  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = cast<CastInst>(&I);
    setDebugLocFromInst(Builder, CI);

    /// Vectorize casts.
    assert(!VF.Scalable && "VF is assumed to be non scalable.");
    Type *DestTy =
        (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = State.get(User.getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      VectorLoopValueMap.setVectorValue(&I, Part, Cast);
      addMetadata(Cast, &I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}

/// Vectorize the call \p I: build the vectorized argument list from
/// \p ArgOperands (keeping intrinsic scalar operands scalar), then pick —
/// by comparing cost-model estimates — between the vector form of the
/// intrinsic and a vectorized library function, and emit one widened call
/// per unroll part. Scalarization must have been decided elsewhere.
void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                                               VPTransformState &State) {
  assert(!isa<DbgInfoIntrinsic>(I) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  setDebugLocFromInst(Builder, &I);

  Module *M = I.getParent()->getParent()->getParent();
  auto *CI = cast<CallInst>(&I);

  SmallVector<Type *, 4> Tys;
  for (Value *ArgOperand : CI->arg_operands())
    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.Min));

  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // The flag shows whether we use Intrinsic or a usual Call for vectorized
  // version of the instruction.
  // Is it beneficial to perform intrinsic call compared to lib call?
  bool NeedToScalarize = false;
  unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
  bool UseVectorIntrinsic =
      ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
  assert((UseVectorIntrinsic || !NeedToScalarize) &&
         "Instruction should be scalarized elsewhere.");

  for (unsigned Part = 0; Part < UF; ++Part) {
    SmallVector<Value *, 4> Args;
    for (auto &I : enumerate(ArgOperands.operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector. ({0, 0} requests the scalar value of part 0, lane 0.)
      Value *Arg;
      if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
        Arg = State.get(I.value(), Part);
      else
        Arg = State.get(I.value(), {0, 0});
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseVectorIntrinsic) {
      // Use vector version of the intrinsic.
      Type *TysForDecl[] = {CI->getType()};
      if (VF.isVector()) {
        assert(!VF.Scalable && "VF is assumed to be non scalable.");
        TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
      }
      VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
      // Use vector version of the function call.
      const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
#ifndef NDEBUG
      assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
             "Can't create vector function.");
#endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    VectorLoopValueMap.setVectorValue(&I, Part, V);
    addMetadata(V, &I);
  }
}

/// Widen the select \p I into one vector select per unroll part. If
/// \p InvariantCond is set, the (loop-invariant) condition is read once as
/// the scalar value of lane 0; otherwise the vectorized condition of each
/// part is used.
void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
                                                 VPUser &Operands,
                                                 bool InvariantCond,
                                                 VPTransformState &State) {
  setDebugLocFromInst(Builder, &I);

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;

  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *Cond =
        InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
    Value *Op0 = State.get(Operands.getOperand(1), Part);
    Value *Op1 = State.get(Operands.getOperand(2), Part);
    Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
    VectorLoopValueMap.setVectorValue(&I, Part, Sel);
    addMetadata(Sel, &I);
  }
}

/// Collect, for vectorization factor \p VF, the instructions that will
/// remain scalar after vectorization, storing them in Scalars[VF].
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
         "This function should not be visited twice for the same VF");

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    // A store's value operand stays scalar iff the store itself is
    // scalarized; a pointer operand stays scalar unless gathered/scattered.
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that returns true if Ptr is a phi known to be a pointer
  // induction variable whose use by MemAccess is a scalar use.
  auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
    if (!isa<PHINode>(Ptr) ||
        !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
      return false;
    auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
    if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
      return false;
    return isScalarUse(MemAccess, Ptr);
  };

  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is actually the pointer induction of a loop, it is being
  // inserted into Worklist. If the use will be a scalar use, and the
  // pointer is only used by memory accesses, we place the pointer in
  // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    if (isScalarPtrInduction(MemAccess, Ptr)) {
      Worklist.insert(cast<Instruction>(Ptr));
      // The induction's latch update stays scalar along with the phi.
      Instruction *Update = cast<Instruction>(
          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
      Worklist.insert(Update);
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
                        << "\n");
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
                        << "\n");
      return;
    }
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use, and (3) (below) forced scalars.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    // Src joins the worklist only when every in-loop user is already known
    // scalar or is a memory access using Src as a scalar operand.
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}

/// Returns true if \p I must be scalarized and predicated when executed in
/// a block that needs predication: either a masked load/store for which
/// masked vector forms are illegal (or the cost model chose to scalarize),
/// or an integer division/remainder that may divide by zero.
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
                                                         ElementCount VF) {
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  if (!blockNeedsPredication(I->getParent()))
    return false;
  switch(I->getOpcode()) {
  default:
    break;
  case Instruction::Load:
  case Instruction::Store: {
    if (!Legal->isMaskRequired(I))
      return false;
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getMemInstValueType(I);
    // We have already decided how to vectorize this instruction, get that
    // result.
    if (VF.isVector()) {
      InstWidening WideningDecision = getWideningDecision(I, VF);
      assert(WideningDecision != CM_Unknown &&
             "Widening decision should be ready at this moment");
      return WideningDecision == CM_Scalarize;
    }
    const Align Alignment = getLoadStoreAlignment(I);
    // For scalar VF, fall back to target legality of the masked forms.
    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
                                isLegalMaskedGather(Ty, Alignment))
                            : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
                                isLegalMaskedScatter(Ty, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    return mayDivideByZero(*I);
  }
  return false;
}

/// Returns true if the interleave group of \p I can be vectorized as a wide
/// (possibly masked) load/store: the element type must not need padding,
/// and if masking is required the target must support the masked form.
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
    Instruction *I, ElementCount VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getMemInstValueType(I);
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps.
  bool PredicatedAccessRequiresMasking =
      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
  bool AccessWithGapsRequiresMasking =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  auto *Ty = getMemInstValueType(I);
  const Align Alignment = getLoadStoreAlignment(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
                          : TTI.isLegalMaskedStore(Ty, Alignment);
}

/// Returns true if the load/store \p I can be widened into a single
/// consecutive vector access: the pointer must be consecutive, the access
/// must not require predicated scalarization, and the element type must not
/// need padding.
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
    Instruction *I, ElementCount VF) {
  // Get and ensure we have a valid memory instruction.
  LoadInst *LI = dyn_cast<LoadInst>(I);
  StoreInst *SI = dyn_cast<StoreInst>(I);
  assert((LI || SI) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I))
    return false;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  return true;
}

/// Collect, for vectorization factor \p VF, the instructions that will be
/// uniform after vectorization (one value per vector iteration), storing
/// them in Uniforms[VF].
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
  // not analyze again. Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Instructions that are scalar with predication must not be considered
  // uniform after vectorization, because that would create an erroneous
  // replicating region where only a single instance out of VF should be formed.
  // TODO: optimize such seldom cases if found important, see PR40816.
  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (isScalarWithPredication(I, VF)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                        << *I << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(I);
  };

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
    addToWorklistIfAllowed(Cmp);

  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
  // are pointers that are treated like consecutive pointers during
  // vectorization. The pointer operands of interleaved accesses are an
  // example.
  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;

  // Holds pointer operands of instructions that are possibly non-uniform.
  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;

  // True when the widening decision keeps the access a single wide
  // (consecutive or interleaved) operation, i.e. not gather/scatter or
  // scalarized.
  auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };
  // Iterate over the instructions in the loop, and collect all
  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
  // that a consecutive-like pointer operand will be scalarized, we collect it
  // in PossibleNonUniformPtrs instead. We use two sets here because a single
  // getelementptr instruction can be used by both vectorized and scalarized
  // memory instructions. For example, if a loop loads and stores from the same
  // location, but the store is conditional, the store will be scalarized, and
  // the getelementptr won't remain uniform.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (!Ptr)
        continue;

      // True if all users of Ptr are memory accesses that have Ptr as their
      // pointer operand.
      auto UsersAreMemAccesses =
          llvm::all_of(Ptr->users(), [&](User *U) -> bool {
            return getLoadStorePointerOperand(U) == Ptr;
          });

      // Ensure the memory instruction will not be scalarized or used by
      // gather/scatter, making its pointer operand non-uniform. If the pointer
      // operand is used by any instruction other than a memory access, we
      // conservatively assume the pointer operand may be non-uniform.
      if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
        PossibleNonUniformPtrs.insert(Ptr);

      // If the memory instruction will be vectorized and its pointer operand
      // is consecutive-like, or interleaving - the pointer operand should
      // remain uniform.
      else
        ConsecutiveLikePtrs.insert(Ptr);
    }

  // Add to the Worklist all consecutive and consecutive-like pointers that
  // aren't also identified as possibly non-uniform.
  for (auto *V : ConsecutiveLikePtrs)
    if (!PossibleNonUniformPtrs.count(V))
      addToWorklistIfAllowed(V);

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist. It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (isOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(OV);
      if (OP && Legal->isFirstOrderRecurrence(OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(OV);
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
            auto *J = cast<Instruction>(U);
            return Worklist.count(J) ||
                   (OI == getLoadStorePointerOperand(J) &&
                    isUniformDecision(J, VF));
          }))
        addToWorklistIfAllowed(OI);
    }
  }

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, and I is known to not require scalarization.
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
  };

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    addToWorklistIfAllowed(Ind);
    addToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}

/// Returns true (after emitting an optimization remark) if vectorizing this
/// loop would require runtime checks — pointer aliasing, SCEV predicate, or
/// stride==1 specialization — which are not acceptable when optimizing for
/// size.
bool LoopVectorizationCostModel::runtimeChecksRequired() {
  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

  if (Legal->getRuntimePointerChecking()->Need) {
    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  if (!PSE.getUnionPredicate().getPredicates().empty()) {
    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
        "runtime SCEV checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check for small trip count",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop without such check by compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}

/// Compute the maximum vectorization factor for this loop, honoring a
/// user-provided \p UserVF / \p UserIC, the scalar-epilogue policy, and
/// tail-folding; returns None when the loop should not be vectorized.
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
                                                            unsigned UserIC) {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do since it's still likely to be dynamically
    // uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return UserVF ? UserVF : computeFeasibleMaxVF(TC);
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;
    break;
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
  assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
  unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
  if (TC > 0 && TC % MaxVFtimesIC == 0) {
    // Accept MaxVF if we do not have a tail.
    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
    return MaxVF;
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxVF;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fallback to a vectorization with a scalar epilogue.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) {
      LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
      return None;
    }
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxVF;
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return None;
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return None;
}

/// Compute the widest feasible VF from the target's register width, the
/// loop's smallest/widest value types, and LAA's maximum safe dependence
/// distance, optionally clamped to a small constant trip count.
unsigned
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();

  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
  unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << WidestRegister << " bits.\n");

  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
                                 " into one vector!");
  if (MaxVectorSize == 0) {
    LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
    return MaxVectorSize;
  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
             isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to be the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below.
5331 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5332 << ConstTripCount << "\n"); 5333 MaxVectorSize = ConstTripCount; 5334 return MaxVectorSize; 5335 } 5336 5337 unsigned MaxVF = MaxVectorSize; 5338 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5339 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5340 // Collect all viable vectorization factors larger than the default MaxVF 5341 // (i.e. MaxVectorSize). 5342 SmallVector<ElementCount, 8> VFs; 5343 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5344 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5345 VFs.push_back(ElementCount::getFixed(VS)); 5346 5347 // For each VF calculate its register usage. 5348 auto RUs = calculateRegisterUsage(VFs); 5349 5350 // Select the largest VF which doesn't require more registers than existing 5351 // ones. 5352 for (int i = RUs.size() - 1; i >= 0; --i) { 5353 bool Selected = true; 5354 for (auto& pair : RUs[i].MaxLocalUsers) { 5355 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5356 if (pair.second > TargetNumRegisters) 5357 Selected = false; 5358 } 5359 if (Selected) { 5360 MaxVF = VFs[i].Min; 5361 break; 5362 } 5363 } 5364 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5365 if (MaxVF < MinVF) { 5366 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5367 << ") with target's minimum: " << MinVF << '\n'); 5368 MaxVF = MinVF; 5369 } 5370 } 5371 } 5372 return MaxVF; 5373 } 5374 5375 VectorizationFactor 5376 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5377 float Cost = expectedCost(ElementCount::getFixed(1)).first; 5378 const float ScalarCost = Cost; 5379 unsigned Width = 1; 5380 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5381 5382 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5383 if (ForceVectorization && MaxVF > 1) { 5384 // Ignore scalar width, because 
// the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    Cost = std::numeric_limits<float>::max();
  }

  for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed less times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
    VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
    float VectorCost = C.first / (float)i;
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (int)VectorCost << ".\n");
    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  if (!EnableCondStoresVectorization && NumPredStores) {
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    Width = 1;
    Cost = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
  VectorizationFactor Factor = {ElementCount::getFixed(Width),
                                (unsigned)(Width * Cost)};
  return Factor;
}

/// Scan the loads, stores and reduction PHIs of the loop and return the
/// smallest and widest scalar value sizes observed, in bits. MaxWidth starts
/// at 8 so the reported widest type is never narrower than a byte; MinWidth
/// starts at -1U (UINT_MAX) so any observed type narrows it.
std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.count(&I))
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // vectorizable.
      //
      // FIXME: The check here attempts to predict whether a load or store will
      //        be vectorized. We only know this for certain after a VF has
      //        been selected. Here, we assume that if an access can be
      //        vectorized, it will be. We should also look at extending this
      //        optimization to non-pointer types.
      //
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
        continue;

      MinWidth = std::min(MinWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
      MaxWidth = std::max(MaxWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
    }
  }

  return {MinWidth, MaxWidth};
}

unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level.
// For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  //    iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  //    overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  //    due to the increased register pressure.

  if (!isScalarEpilogueAllowed())
    return 1;

  // We used the distance for the interleave count.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  // Do not interleave loops with a relatively small known or estimated trip
  // count.
  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
  if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
    return 1;

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto& pair : R.MaxLocalUsers) {
    pair.second = std::max(pair.second, 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  // Start from "unlimited" and take the minimum over all register classes.
  unsigned IC = UINT_MAX;

  for (auto& pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first) << " register class\n");
    // Command-line overrides for the register budget (testing hooks).
    if (VF == 1) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
      LoopInvariantRegs = R.LoopInvariantRegs[pair.first];

    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC =
          PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                        std::max(1U, (MaxLocalUsers - 1)));
    }

    IC = std::min(IC, TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF.Min);

  // Check if the user has overridden the max.
  if (VF == 1) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If trip count is known or estimated compile time constant, limit the
  // interleave count to be less than the trip count divided by VF.
  if (BestKnownTC) {
    MaxInterleaveCount = std::min(*BestKnownTC / VF.Min, MaxInterleaveCount);
  }

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0)
    LoopCost = expectedCost(VF).first;

  assert(LoopCost && "Non-zero loop cost expected");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else if (IC < 1)
    IC = 1;

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && !Legal->getReductionVars().empty()) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default to 2, so the
    // critical path only gets increased by one reduction operation.
    if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
bool HasReductions = !Legal->getReductionVars().empty();
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order in order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more register.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;

  // Get the size of the widest register.
  unsigned MaxSafeDepDist = -1U;
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
  unsigned WidestRegister =
      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF.
  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) {
    if (Ty->isTokenTy())
      return 0U;
    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
    assert(!VF.Scalable && "scalable vectors not yet supported.");
    return std::max<unsigned>(1, VF.Min * TypeSize / WidestRegister);
  };

  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore instructions that are never used within the loop.
    if (!Ends.count(I))
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.count(I))
      continue;

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      // Count the number of live intervals.
      SmallMapVector<unsigned, unsigned, 4> RegUsage;

      if (VFs[j].isScalar()) {
        // At VF = 1 every open value occupies exactly one register of its
        // class, regardless of the value's type width.
        for (auto Inst : OpenIntervals) {
          unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
          if (RegUsage.find(ClassID) == RegUsage.end())
            RegUsage[ClassID] = 1;
          else
            RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VFs[j]);
        for (auto Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
            if (RegUsage.find(ClassID) == RegUsage.end())
              RegUsage[ClassID] = 1;
            else
              RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
            if (RegUsage.find(ClassID) == RegUsage.end())
              RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
            else
              RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }

      for (auto& pair : RegUsage) {
        if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
        else
          MaxUsages[j][pair.first] = pair.second;
      }
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto Inst : LoopInvariants) {
      unsigned Usage =
          VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      if (Invariant.find(ClassID) == Invariant.end())
        Invariant[ClassID] = Usage;
      else
        Invariant[ClassID] += Usage;
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n";
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() ||
      InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

/// Compute the total discount (vector cost minus probability-scaled scalar
/// cost) gained by scalarizing the single-use instruction chain feeding
/// \p PredInst; a non-negative result means scalarization is at least
/// cost-neutral. Each visited instruction's scalar cost is recorded in
/// \p ScalarCosts.
int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
    ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  int Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    unsigned VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    assert(!VF.Scalable && "scalable vectors not yet supported.");
    unsigned ScalarCost =
        VF.Min * getInstructionCost(I, ElementCount::getFixed(1)).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(
          cast<VectorType>(ToVectorTy(I->getType(), VF)),
          APInt::getAllOnesValue(VF.Min), true, false);
      assert(!VF.Scalable && "scalable vectors not yet supported.");
      ScalarCost +=
          VF.Min *
          TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF)) {
          assert(!VF.Scalable && "scalable vectors not yet supported.");
          ScalarCost += TTI.getScalarizationOverhead(
              cast<VectorType>(ToVectorTy(J->getType(), VF)),
              APInt::getAllOnesValue(VF.Min), false, true);
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}

/// Accumulate the expected cost of the whole loop at \p VF by summing
/// per-instruction costs over all blocks; the boolean half of the pair
/// records whether any instruction produced vector code.
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(ElementCount VF) {
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  VectorizationCostTy Cost;

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    VectorizationCostTy BlockCost;

    // For each instruction in the old loop.
6032 for (Instruction &I : BB->instructionsWithoutDebug()) { 6033 // Skip ignored values. 6034 if (ValuesToIgnore.count(&I) || 6035 (VF.isVector() && VecValuesToIgnore.count(&I))) 6036 continue; 6037 6038 VectorizationCostTy C = getInstructionCost(&I, VF); 6039 6040 // Check if we should override the cost. 6041 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6042 C.first = ForceTargetInstructionCost; 6043 6044 BlockCost.first += C.first; 6045 BlockCost.second |= C.second; 6046 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6047 << " for VF " << VF << " For instruction: " << I 6048 << '\n'); 6049 } 6050 6051 // If we are vectorizing a predicated block, it will have been 6052 // if-converted. This means that the block's instructions (aside from 6053 // stores and instructions that may divide by zero) will now be 6054 // unconditionally executed. For the scalar case, we may not always execute 6055 // the predicated block. Thus, scale the block's cost by the probability of 6056 // executing it. 6057 if (VF.isScalar() && blockNeedsPredication(BB)) 6058 BlockCost.first /= getReciprocalPredBlockProb(); 6059 6060 Cost.first += BlockCost.first; 6061 Cost.second |= BlockCost.second; 6062 } 6063 6064 return Cost; 6065 } 6066 6067 /// Gets Address Access SCEV after verifying that the access pattern 6068 /// is loop invariant except the induction variable dependence. 6069 /// 6070 /// This SCEV can be sent to the Target in order to estimate the address 6071 /// calculation cost. 6072 static const SCEV *getAddressAccessSCEV( 6073 Value *Ptr, 6074 LoopVectorizationLegality *Legal, 6075 PredicatedScalarEvolution &PSE, 6076 const Loop *TheLoop) { 6077 6078 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6079 if (!Gep) 6080 return nullptr; 6081 6082 // We are looking for a gep with all loop invariant indices except for one 6083 // which should be an induction variable. 
6084 auto SE = PSE.getSE(); 6085 unsigned NumOperands = Gep->getNumOperands(); 6086 for (unsigned i = 1; i < NumOperands; ++i) { 6087 Value *Opd = Gep->getOperand(i); 6088 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6089 !Legal->isInductionVariable(Opd)) 6090 return nullptr; 6091 } 6092 6093 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6094 return PSE.getSCEV(Ptr); 6095 } 6096 6097 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6098 return Legal->hasStride(I->getOperand(0)) || 6099 Legal->hasStride(I->getOperand(1)); 6100 } 6101 6102 unsigned 6103 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6104 ElementCount VF) { 6105 assert(VF.isVector() && 6106 "Scalarization cost of instruction implies vectorization."); 6107 assert(!VF.Scalable && "scalable vectors not yet supported."); 6108 Type *ValTy = getMemInstValueType(I); 6109 auto SE = PSE.getSE(); 6110 6111 unsigned AS = getLoadStoreAddressSpace(I); 6112 Value *Ptr = getLoadStorePointerOperand(I); 6113 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6114 6115 // Figure out whether the access is strided and get the stride value 6116 // if it's known in compile time 6117 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6118 6119 // Get the cost of the scalar memory instruction and address computation. 6120 unsigned Cost = VF.Min * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6121 6122 // Don't pass *I here, since it is scalar but will actually be part of a 6123 // vectorized loop where the user of it is a vectorized instruction. 6124 const Align Alignment = getLoadStoreAlignment(I); 6125 Cost += VF.Min * 6126 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6127 AS, TTI::TCK_RecipThroughput); 6128 6129 // Get the overhead of the extractelement and insertelement instructions 6130 // we might create due to scalarization. 
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated store, it may not be executed for each vector
  // lane. Scale the cost by the probability of executing the predicated
  // block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    if (useEmulatedMaskMemRefHack(I))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}

/// Cost of a consecutive (stride +1/-1) wide load or store, including a
/// reverse shuffle for negative strides and a masked op when a mask is
/// required.
unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                             ElementCount VF) {
  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  Value *Ptr = getLoadStorePointerOperand(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
         "Stride should be 1 or -1 for consecutive memory access");
  const Align Alignment = getLoadStoreAlignment(I);
  unsigned Cost = 0;
  if (Legal->isMaskRequired(I))
    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                      CostKind);
  else
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                CostKind, I);

  // A negative stride additionally pays for reversing the loaded/stored
  // vector.
  bool Reverse = ConsecutiveStride < 0;
  if (Reverse)
    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  return Cost;
}

/// Cost of a memory op with a loop-invariant address:
/// Load:  scalar load + broadcast.
/// Store: scalar store + (loop-invariant value ? 0 : extract of the last lane).
unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                         ElementCount VF) {
  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  if (isa<LoadInst>(I)) {
    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
                               CostKind) +
           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
  }
  StoreInst *SI = cast<StoreInst>(I);

  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
  // For a varying stored value, only the last lane's value is stored; extract
  // it (lane VF.Min - 1) before the scalar store.
  return TTI.getAddressComputationCost(ValTy) +
         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
                             CostKind) +
         (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
                                              Instruction::ExtractElement,
                                              VectorTy, VF.Min - 1));
}

/// Cost of a gather load / scatter store: vector address computation plus the
/// target's gather/scatter op cost (masked if a mask is required).
unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                          ElementCount VF) {
  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  const Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(
             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
             TargetTransformInfo::TCK_RecipThroughput, I);
}

/// Cost of an interleaved access group: one wide memory op of
/// VF * InterleaveFactor elements, plus reverse shuffles for reversed groups.
unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                            ElementCount VF) {
  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in an interleaved load group.
  // An interleaved store group doesn't need this as it doesn't allow gaps.
  SmallVector<unsigned, 4> Indices;
  if (isa<LoadInst>(I)) {
    // Load groups may have gaps; record which member slots are present.
    for (unsigned i = 0; i < InterleaveFactor; i++)
      if (Group->getMember(i))
        Indices.push_back(i);
  }

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  unsigned Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    // Each present member pays for one reverse shuffle of its subvector.
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  }
  return Cost;
}

/// Dispatch: for scalar VF compute the plain scalar memory-op cost; for vector
/// VF return the cost recorded by the widening decision (which must already
/// have been taken).
unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
                                                              ElementCount VF) {
  // Calculate scalar cost only. Vectorization cost should be ready at this
  // moment.
  if (VF.isScalar()) {
    Type *ValTy = getMemInstValueType(I);
    const Align Alignment = getLoadStoreAlignment(I);
    unsigned AS = getLoadStoreAddressSpace(I);

    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
                               TTI::TCK_RecipThroughput, I);
  }
  return getWideningCost(I, VF);
}

LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {
  assert(!VF.Scalable &&
         "the cost model is not yet implemented for scalable vectorization");
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = ElementCount::getFixed(1);

  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return VectorizationCostTy(InstsToScalarize[VF][I], false);

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(I))
      // Pay the scalar (VF=1) cost once per lane.
      return VectorizationCostTy(
          (getInstructionCost(I, ElementCount::getFixed(1)).first * VF.Min),
          false);
  }

  Type *VectorTy;
  unsigned C = getInstructionCost(I, VF, VectorTy);

  // True when the vector type legalizes into fewer parts than there are
  // lanes, i.e. the type is genuinely vectorized rather than split per lane.
  bool TypeNotScalarized = VF.isVector() && VectorTy->isVectorTy() &&
                           TTI.getNumberOfParts(VectorTy) < VF.Min;
  return VectorizationCostTy(C, TypeNotScalarized);
}

/// Estimate the insertelement/extractelement overhead incurred when \p I is
/// replicated per lane: result insertion plus operand extraction, modulo
/// target hooks that make either free.
unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                              ElementCount VF) {

  assert(!VF.Scalable &&
         "cannot compute scalarization overhead for scalable vectorization");
  if (VF.isScalar())
    return 0;

  unsigned Cost = 0;
  Type *RetTy = ToVectorTy(I->getType(), VF);
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
    Cost += TTI.getScalarizationOverhead(
        cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.Min), true, false);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider. For calls, only the actual arguments count,
  // not the callee operand.
  CallInst *CI = dyn_cast<CallInst>(I);
  Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  return Cost +
         TTI.getOperandsScalarizationOverhead(filterExtractingOperands(Ops, VF),
                                              VF.Min);
}

/// For every memory instruction in the loop, pick the cheapest widening
/// strategy for \p VF (widen, widen-reversed, interleave, gather/scatter, or
/// scalarize) and record it for later use by both cost estimation and code
/// generation.
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;

      if (Legal->isUniform(Ptr) &&
          // Conditional loads and stores should be scalarized and predicated.
          // isScalarWithPredication cannot be used here since masked
          // gather/scatters are not considered scalar with predication.
          !Legal->blockNeedsPredication(I.getParent())) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        unsigned Cost = getUniformMemOpCost(&I, VF);
        setWideningDecision(&I, VF, CM_Scalarize, Cost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride =
            Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      // Gather/scatter and scalarization costs are per member, so scale by
      // NumAccesses to compare against the whole-group interleave cost.
      unsigned GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : std::numeric_limits<unsigned>::max();

      unsigned ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      unsigned Cost;
      InstWidening Decision;
      // Ties between interleave and gather/scatter go to interleave (<=);
      // ties between gather/scatter and scalarization go to scalarization (<).
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
        dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  for (auto *I : AddrDefs)
    Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        // Only chase operands in the same block, and never through PHIs.
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.Min * getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        // NOTE(review): the loop index `I` shadows the outer `Instruction *I`
        // from the enclosing range-for; intentional here but easy to misread.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.Min *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}

/// Core per-opcode cost computation. \p VectorTy is an out-parameter set to
/// the (possibly truncated) type the instruction is costed at.
unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                        ElementCount VF,
                                                        Type *&VectorTy) {
  Type *RetTy = I->getType();
  // Cost the instruction at its minimal legal bitwidth when it can be shrunk.
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Return cost for branches around scalarized and predicated blocks.
      assert(!VF.Scalable && "scalable vectors not yet supported.");
      auto *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (TTI.getScalarizationOverhead(
                  Vec_i1Ty, APInt::getAllOnesValue(VF.Min), false, true) +
              (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.Min));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br, CostKind);
    else
      // This branch will be eliminated by if-conversion.
      return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
    if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
      return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
                                cast<VectorType>(VectorTy), VF.Min - 1,
                                FixedVectorType::get(RetTy, 1));

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
                 CostKind);

    return TTI.getCFInstrCost(Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // If we have a predicated instruction, it may not be executed for each
    // vector lane. Get the scalarization cost and scale this amount by the
    // probability of executing the predicated block. If the instruction is not
    // predicated, we fall through to the next case.
    if (VF.isVector() && isScalarWithPredication(I)) {
      unsigned Cost = 0;

      // These instructions have a non-void type, so account for the phi nodes
      // that we will create. This cost is likely to be zero. The phi node
      // cost, if any, should be scaled by the block probability because it
      // models a copy at the end of each predicated block.
      Cost += VF.Min * TTI.getCFInstrCost(Instruction::PHI, CostKind);

      // The cost of the non-predicated instruction.
      Cost +=
          VF.Min * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);

      // The cost of insertelement and extractelement instructions needed for
      // scalarization.
      Cost += getScalarizationOverhead(I, VF);

      // Scale the cost by the probability of executing the predicated blocks.
      // This assumes the predicated block for each vector lane is equally
      // likely.
      return Cost / getReciprocalPredBlockProb();
    }
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Since we will replace the stride by 1 the multiplication should go away.
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(1);
    TargetTransformInfo::OperandValueProperties Op2VP;
    TargetTransformInfo::OperandValueKind Op2VK =
        TTI.getOperandInfo(Op2, Op2VP);
    // A loop-uniform operand behaves like a splat for costing purposes.
    if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
      Op2VK = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    // If the instruction stays scalar, pay the scalar cost once per lane.
    unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, CostKind,
                   TargetTransformInfo::OK_AnyValue,
                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
  }
  case Instruction::FNeg: {
    assert(!VF.Scalable && "VF is assumed to be non scalable.");
    unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, CostKind,
                   TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
                   I->getOperand(0), I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    // A loop-varying condition becomes a vector of i1; an invariant one stays
    // scalar.
    if (!ScalarCond) {
      assert(!VF.Scalable && "VF is assumed to be non scalable.");
      CondTy = VectorType::get(CondTy, VF);
    }
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
                                  CostKind, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    // Compares are costed at the (possibly shrunk) width of their operand.
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
                                  I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    ElementCount Width = VF;
    if (Width.isVector()) {
      InstWidening Decision = getWideningDecision(I, Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (Decision == CM_Scalarize)
        Width = ElementCount::getFixed(1);
    }
    VectorTy = ToVectorTy(getMemInstValueType(I), Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // Computes the CastContextHint from a Load/Store instruction.
    // NOTE(review): the lambda parameter `I` shadows the outer `I`.
    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             "Expected a load or a store!");

      if (VF.isScalar() || !TheLoop->contains(I))
        return TTI::CastContextHint::Normal;

      switch (getWideningDecision(I, VF)) {
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
        return TTI::CastContextHint::Interleave;
      case LoopVectorizationCostModel::CM_Scalarize:
      case LoopVectorizationCostModel::CM_Widen:
        return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
                                        : TTI::CastContextHint::Normal;
      case LoopVectorizationCostModel::CM_Widen_Reverse:
        return TTI::CastContextHint::Reversed;
      case LoopVectorizationCostModel::CM_Unknown:
        llvm_unreachable("Instr did not go through cost modelling?");
      }

      llvm_unreachable("Unhandled case!");
    };

    unsigned Opcode = I->getOpcode();
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc, the context is the only user, which must be a StoreInst.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                  Trunc->getSrcTy(), CCH, CostKind, Trunc);
    }

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (Opcode == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      }
    }

    assert(!VF.Scalable && "VF is assumed to be non scalable");
    unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1;
    return N *
           TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call: {
    bool NeedToScalarize;
    CallInst *CI = cast<CallInst>(I);
    unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
    // Prefer the intrinsic lowering when it is cheaper than a library call.
    if (getVectorIntrinsicIDForCall(CI, TLI))
      return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
    return CallCost;
  }
  default:
    // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
    return VF.Min *
               TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
                                          CostKind) +
           getScalarizationOverhead(I, VF);
  } // end of switch.
}

char LoopVectorize::ID = 0;

static const char lv_name[] = "Loop Vectorization";

// Legacy pass-manager registration and analysis dependencies.
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {

Pass *createLoopVectorizePass() { return new LoopVectorize(); }

Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
                              bool VectorizeOnlyWhenForced) {
  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
}

} // end namespace llvm

bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  // Check if the pointer operand of a load or store instruction is
  // consecutive.
  if (auto *Ptr = getLoadStorePointerOperand(Inst))
    return Legal->isConsecutivePtr(Ptr);
  return false;
}

/// Populate ValuesToIgnore / VecValuesToIgnore so the cost model does not
/// charge for instructions that will not exist in the vectorized loop.
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (auto &Reduction : Legal->getReductionVars()) {
    RecurrenceDescriptor &RedDes = Reduction.second;
    SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (auto &Induction : Legal->getInductionVars()) {
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
}

/// Record, per reduction phi, the chain of in-loop reduction operations when
/// in-loop reductions are both requested and legal here.
void LoopVectorizationCostModel::collectInLoopReductions() {
  // For the moment, without predicated reduction instructions, we do not
  // support inloop reductions whilst folding the tail, and hence in those cases
  // all reductions are currently out of the loop.
  if (!PreferInLoopReductions || foldTailByMasking())
    return;

  for (auto &Reduction : Legal->getReductionVars()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Reduction.second;

    // We don't collect reductions that are type promoted (yet).
    if (RdxDesc.getRecurrenceType() != Phi->getType())
      continue;

    // Check that we can correctly put the reductions into the loop, by
    // finding the chain of operations that leads from the phi to the loop
    // exit value.
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();
    if (InLoop)
      InLoopReductionChains[Phi] = ReductionOperations;
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}

// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`.
// We cannot do it because VPLAN at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
/// Derive a VF for the VPlan-native path: widest register width divided by the
/// widest scalar type seen in the loop.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  assert(!UserVF.Scalable && "scalable vectors not yet supported");
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  // Note: !empty() means the loop HAS subloops, i.e. it is an outer loop.
  if (!OrigLoop->empty()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = ElementCount::getFixed(
          determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.Min) && "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    // A single-VF range: no cost model exists yet to choose among plans.
    buildVPlans(VF.Min, VF.Min);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(!UserVF.Scalable && "scalable vectorization not yet handled");
  assert(OrigLoop->empty() && "Inner loop expected.");
  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF.Min, UserIC);
  if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  if (!UserVF.isZero()) {
    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
    assert(isPowerOf2_32(UserVF.Min) && "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.selectUserVectorizationFactor(UserVF);
    CM.collectInLoopReductions();
    buildVPlansWithVPRecipes(UserVF.Min, UserVF.Min);
    LLVM_DEBUG(printPlans(dbgs()));
    return {{UserVF, 0}};
  }

  unsigned MaxVF = MaybeMaxVF.getValue();
  assert(MaxVF != 0 && "MaxVF is zero.");

  for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(ElementCount::getFixed(VF));

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF > 1)
      CM.collectInstsToScalarize(ElementCount::getFixed(VF));
  }

  CM.collectInLoopReductions();

  buildVPlansWithVPRecipes(1, MaxVF);
  LLVM_DEBUG(printPlans(dbgs()));
  if (MaxVF == 1)
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  return CM.selectVectorizationFactor(MaxVF);
}

void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
  LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
                    << '\n');
  BestVF = VF;
  BestUF = UF;

  // Discard every VPlan that does not cover the chosen VF; exactly one must
  // remain for execution.
  erase_if(VPlans, [VF](const VPlanPtr &Plan) {
    return !Plan->hasVF(VF);
  });
  assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
}

void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  VPCallbackILV CallbackILV(ILV);

  assert(BestVF.hasValue() && "Vectorization Factor is missing");

  VPTransformState State{*BestVF, BestUF,      LI,
                         DT,      ILV.Builder, ILV.VectorLoopValueMap,
                         &ILV,    CallbackILV};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  State.TripCount = ILV.getOrCreateTripCount(nullptr);
  State.CanonicalIV = ILV.Induction;

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
  VPlans.front()->execute(&State);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  //    predication, updating analyses.
  ILV.fixVectorizedLoop();
}

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
  BasicBlock *Latch = OrigLoop->getLoopLatch();

  // We create new control-flow for the vectorized loop, so the original
  // condition will be dead after vectorization if it's only used by the
  // branch.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && Cmp->hasOneUse())
    DeadInstructions.insert(Cmp);

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);

    // We record as "Dead" also the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting
    // chain will get its scalar/vector/widened def from the
    // scalar/vector/widened def of the respective phi node. Any other casts in
    // the induction def-use chain have no other uses outside the phi update
    // chain, and will be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}

// In the unroller values stay scalar (see getStepVector below, which asserts
// VF == 1 semantics), so reversing a "vector" is a no-op.
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

// Likewise, broadcasting a scalar to a 1-wide "vector" is the identity.
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
  Type *Ty = Val->getType();
  assert(!Ty->isVectorTy() && "Val must be a scalar");

  if (Ty->isFloatingPointTy()) {
    Constant *C = ConstantFP::get(Ty, (double)StartIdx);

    // Floating point operations had to be 'fast' to enable the unrolling.
    Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
    return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
  }
  Constant *C = ConstantInt::get(Ty, StartIdx);
  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
}

// Attach "llvm.loop.unroll.runtime.disable" metadata to \p L, unless the loop
// already carries some "llvm.loop.unroll.disable"-prefixed metadata.
static void AddRuntimeUnrollDisableMetaData(Loop *L) {
  SmallVector<Metadata *, 4> MDs;
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(nullptr);
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First find existing loop unrolling disable metadata.
    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
      if (MD) {
        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        IsUnrollMetadata =
            S && S->getString().startswith("llvm.loop.unroll.disable");
      }
      MDs.push_back(LoopID->getOperand(i));
    }
  }

  if (!IsUnrollMetadata) {
    // Add runtime unroll disable metadata.
    LLVMContext &Context = L->getHeader()->getContext();
    SmallVector<Metadata *, 1> DisableOperands;
    DisableOperands.push_back(
        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
    MDs.push_back(DisableNode);
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);
    L->setLoopID(NewLoopID);
  }
}

// Evaluate \p Predicate on the power-of-two VFs in [Range.Start, Range.End),
// clamp Range.End down to the first VF whose answer differs from the answer at
// Range.Start, and return the answer at Range.Start. This keeps the range
// uniform w.r.t. the queried decision.
bool LoopVectorizationPlanner::getDecisionAndClampRange(
    const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
  assert(Range.End > Range.Start && "Trying to test an empty VF range.");
  bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start));

  for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
    if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }

  return PredicateAtRangeStart;
}

/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VF's starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(buildVPlan(SubRange));
    // buildVPlan may have clamped SubRange.End; continue from there.
    VF = SubRange.End;
  }
}

VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlanPtr &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = createBlockInMask(Src, Plan);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  // An unconditional branch (or one with identical successors) adds no edge
  // condition: the edge mask is just the source block's mask.
  if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
  assert(EdgeMask && "No Edge Mask found for condition");

  // The false edge is guarded by the negated condition.
  if (BI->getSuccessor(0) != Dst)
    EdgeMask = Builder.createNot(EdgeMask);

  if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
    EdgeMask = Builder.createAnd(EdgeMask, SrcMask);

  return EdgeMaskCache[Edge] = EdgeMask;
}

VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  if (OrigLoop->getHeader() == BB) {
    if (!CM.blockNeedsPredication(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC.
    // Start by constructing the desired canonical IV.
    VPValue *IV = nullptr;
    if (Legal->getPrimaryInduction())
      IV = Plan->getVPValue(Legal->getPrimaryInduction());
    else {
      auto IVRecipe = new VPWidenCanonicalIVRecipe();
      Builder.getInsertBlock()->appendRecipe(IVRecipe);
      IV = IVRecipe->getVPValue();
    }
    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
    bool TailFolded = !CM.isScalarEpilogueAllowed();

    if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as a second argument, we only pass the IV here and extract the
      // tripcount from the transform state where codegen of the VP instructions
      // happen.
      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
    } else {
      BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    }
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask);
  }

  return BlockMaskCache[BB] = BlockMask;
}

VPWidenMemoryInstructionRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
                                  VPlanPtr &Plan) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store");

  auto willWiden = [&](ElementCount VF) -> bool {
    assert(!VF.Scalable && "unexpected scalable ElementCount");
    if (VF.isScalar())
      return false;
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  // Masked accesses need the mask of the block containing the access.
  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
  if (LoadInst *Load = dyn_cast<LoadInst>(I))
    return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);

  StoreInst *Store = cast<StoreInst>(I);
  VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
  return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
}

VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
  // Check if this is an integer or fp induction. If so, build the recipe that
  // produces its scalar and vector values.
  InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
  if (II.getKind() == InductionDescriptor::IK_IntInduction ||
      II.getKind() == InductionDescriptor::IK_FpInduction)
    return new VPWidenIntOrFpInductionRecipe(Phi);

  return nullptr;
}

VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
                                                VFRange &Range) const {
  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto isOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(ElementCount)> {
    return [=](ElementCount VF) -> bool {
      return CM.isOptimizableIVTruncate(K, VF);
    };
  };

  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          isOptimizableIVTruncate(I), Range))
    // Widen the truncated induction itself, rooted at the trunc's source phi.
    return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
                                             I);
  return nullptr;
}

VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
  // We know that all PHIs in non-header blocks are converted into selects, so
  // we don't have to worry about the insertion order and we can just use the
  // builder. At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  // Operands are laid out as (value0, mask0, value1, mask1, ...); the mask is
  // omitted when the edge mask is all-one (nullptr).
  SmallVector<VPValue *, 2> Operands;
  unsigned NumIncoming = Phi->getNumIncomingValues();
  for (unsigned In = 0; In < NumIncoming; In++) {
    VPValue *EdgeMask =
        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
    assert((EdgeMask || NumIncoming == 1) &&
           "Multiple predecessors with one having a full mask");
    Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
    if (EdgeMask)
      Operands.push_back(EdgeMask);
  }
  return new VPBlendRecipe(Phi, Operands);
}

VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
                                                   VPlan &Plan) const {

  // Calls that must be predicated are not widened here.
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [this, CI](ElementCount VF) {
        return CM.isScalarWithPredication(CI, VF);
      },
      Range);

  if (IsPredicated)
    return nullptr;

  // These intrinsics are dropped rather than widened.
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
    return nullptr;

  auto willWiden = [&](ElementCount VF) -> bool {
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use Intrinsic or a usual Call for vectorized
    // version of the instruction.
    // Is it beneficial to perform intrinsic call compared to lib call?
    bool NeedToScalarize = false;
    unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
    bool UseVectorIntrinsic =
        ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
    return UseVectorIntrinsic || !NeedToScalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
}

bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
  assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
         !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // Instruction should be widened, unless it is scalar after vectorization,
  // scalarization is profitable or it is predicated.
  auto WillScalarize = [this, I](ElementCount VF) -> bool {
    return CM.isScalarAfterVectorization(I, VF) ||
           CM.isProfitableToScalarize(I, VF) ||
           CM.isScalarWithPredication(I, VF);
  };
  return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
                                                             Range);
}

VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
  // Whitelist of opcodes VPWidenRecipe knows how to widen.
  auto IsVectorizableOpcode = [](unsigned Opcode) {
    switch (Opcode) {
    case Instruction::Add:
    case Instruction::And:
    case Instruction::AShr:
    case Instruction::BitCast:
    case Instruction::FAdd:
    case Instruction::FCmp:
    case Instruction::FDiv:
    case Instruction::FMul:
    case Instruction::FNeg:
    case Instruction::FPExt:
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::FPTrunc:
    case Instruction::FRem:
    case Instruction::FSub:
    case Instruction::ICmp:
    case Instruction::IntToPtr:
    case Instruction::LShr:
    case Instruction::Mul:
    case Instruction::Or:
    case Instruction::PtrToInt:
    case Instruction::SDiv:
    case Instruction::Select:
    case Instruction::SExt:
    case Instruction::Shl:
    case Instruction::SIToFP:
    case Instruction::SRem:
    case Instruction::Sub:
    case Instruction::Trunc:
    case Instruction::UDiv:
    case Instruction::UIToFP:
    case Instruction::URem:
    case Instruction::Xor:
    case Instruction::ZExt:
      return true;
    }
    return false;
  };

  if (!IsVectorizableOpcode(I->getOpcode()))
    return nullptr;

  // Success: widen this instruction.
  return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
}

VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
    VPlanPtr &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
      Range);

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (auto &Op : I->operands())
    if (auto *PredInst = dyn_cast<Instruction>(Op))
      if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
        PredInst2Recipe[PredInst]->setAlsoPack(false);

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
  assert(VPBB->getSuccessors().empty() &&
         "VPBB has successors when handling predicated replication.");
  // Record predicated instructions for above packing optimizations.
  PredInst2Recipe[I] = Recipe;
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  return RegSucc;
}

VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  // Instructions producing a value need a phi to merge the predicated value
  // with the incoming value on the mask-off path; void instructions don't.
  auto *PHIRecipe =
      Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}

VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                                      VFRange &Range,
                                                      VPlanPtr &Plan) {
  // First, check for specific widening recipes that deal with calls, memory
  // operations, inductions and Phi nodes.
  if (auto *CI = dyn_cast<CallInst>(Instr))
    return tryToWidenCall(CI, Range, *Plan);

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return tryToWidenMemory(Instr, Range, Plan);

  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    // Non-header phis become blends; header phis are inductions or get the
    // generic widen-phi treatment.
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Plan);
    if ((Recipe = tryToOptimizeInductionPHI(Phi)))
      return Recipe;
    return new VPWidenPHIRecipe(Phi);
  }

  if (isa<TruncInst>(Instr) &&
      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
    return Recipe;

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
                                OrigLoop);

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
                                   InvariantCond);
  }

  return tryToWiden(Instr, *Plan);
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
                                                        unsigned MaxVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");

  // Collect conditions feeding internal conditional branches; they need to be
  // represented in VPlan for it to model masking.
7535 SmallPtrSet<Value *, 1> NeedDef; 7536 7537 auto *Latch = OrigLoop->getLoopLatch(); 7538 for (BasicBlock *BB : OrigLoop->blocks()) { 7539 if (BB == Latch) 7540 continue; 7541 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 7542 if (Branch && Branch->isConditional()) 7543 NeedDef.insert(Branch->getCondition()); 7544 } 7545 7546 // If the tail is to be folded by masking, the primary induction variable, if 7547 // exists needs to be represented in VPlan for it to model early-exit masking. 7548 // Also, both the Phi and the live-out instruction of each reduction are 7549 // required in order to introduce a select between them in VPlan. 7550 if (CM.foldTailByMasking()) { 7551 if (Legal->getPrimaryInduction()) 7552 NeedDef.insert(Legal->getPrimaryInduction()); 7553 for (auto &Reduction : Legal->getReductionVars()) { 7554 NeedDef.insert(Reduction.first); 7555 NeedDef.insert(Reduction.second.getLoopExitInstr()); 7556 } 7557 } 7558 7559 // Collect instructions from the original loop that will become trivially dead 7560 // in the vectorized loop. We don't need to vectorize these instructions. For 7561 // example, original induction update instructions can become dead because we 7562 // separately emit induction "steps" when generating code for the new loop. 7563 // Similarly, we create a new latch condition when setting up the structure 7564 // of the new loop, so the old one can become dead. 7565 SmallPtrSet<Instruction *, 4> DeadInstructions; 7566 collectTriviallyDeadInstructions(DeadInstructions); 7567 7568 // Add assume instructions we need to drop to DeadInstructions, to prevent 7569 // them from being added to the VPlan. 7570 // TODO: We only need to drop assumes in blocks that get flattend. If the 7571 // control flow is preserved, we should keep them. 
7572 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 7573 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 7574 7575 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 7576 // Dead instructions do not need sinking. Remove them from SinkAfter. 7577 for (Instruction *I : DeadInstructions) 7578 SinkAfter.erase(I); 7579 7580 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7581 VFRange SubRange = {VF, MaxVF + 1}; 7582 VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef, 7583 DeadInstructions, SinkAfter)); 7584 VF = SubRange.End; 7585 } 7586 } 7587 7588 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 7589 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 7590 SmallPtrSetImpl<Instruction *> &DeadInstructions, 7591 const DenseMap<Instruction *, Instruction *> &SinkAfter) { 7592 7593 // Hold a mapping from predicated instructions to their recipes, in order to 7594 // fix their AlsoPack behavior if a user is determined to replicate and use a 7595 // scalar instead of vector value. 7596 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 7597 7598 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 7599 7600 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 7601 7602 // --------------------------------------------------------------------------- 7603 // Pre-construction: record ingredients whose recipes we'll need to further 7604 // process after constructing the initial VPlan. 7605 // --------------------------------------------------------------------------- 7606 7607 // Mark instructions we'll need to sink later and their targets as 7608 // ingredients whose recipe we'll need to record. 
7609 for (auto &Entry : SinkAfter) { 7610 RecipeBuilder.recordRecipeOf(Entry.first); 7611 RecipeBuilder.recordRecipeOf(Entry.second); 7612 } 7613 for (auto &Reduction : CM.getInLoopReductionChains()) { 7614 PHINode *Phi = Reduction.first; 7615 RecurrenceDescriptor::RecurrenceKind Kind = 7616 Legal->getReductionVars()[Phi].getRecurrenceKind(); 7617 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 7618 7619 RecipeBuilder.recordRecipeOf(Phi); 7620 for (auto &R : ReductionOperations) { 7621 RecipeBuilder.recordRecipeOf(R); 7622 // For min/max reducitons, where we have a pair of icmp/select, we also 7623 // need to record the ICmp recipe, so it can be removed later. 7624 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7625 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7626 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 7627 } 7628 } 7629 } 7630 7631 // For each interleave group which is relevant for this (possibly trimmed) 7632 // Range, add it to the set of groups to be later applied to the VPlan and add 7633 // placeholders for its members' Recipes which we'll be replacing with a 7634 // single VPInterleaveRecipe. 7635 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 7636 auto applyIG = [IG, this](ElementCount VF) -> bool { 7637 return (VF.isVector() && // Query is illegal for VF == 1 7638 CM.getWideningDecision(IG->getInsertPos(), VF) == 7639 LoopVectorizationCostModel::CM_Interleave); 7640 }; 7641 if (!getDecisionAndClampRange(applyIG, Range)) 7642 continue; 7643 InterleaveGroups.insert(IG); 7644 for (unsigned i = 0; i < IG->getFactor(); i++) 7645 if (Instruction *Member = IG->getMember(i)) 7646 RecipeBuilder.recordRecipeOf(Member); 7647 }; 7648 7649 // --------------------------------------------------------------------------- 7650 // Build initial VPlan: Scan the body of the loop in a topological order to 7651 // visit each basic block after having visited its predecessor basic blocks. 
7652 // --------------------------------------------------------------------------- 7653 7654 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 7655 auto Plan = std::make_unique<VPlan>(); 7656 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 7657 Plan->setEntry(VPBB); 7658 7659 // Represent values that will have defs inside VPlan. 7660 for (Value *V : NeedDef) 7661 Plan->addVPValue(V); 7662 7663 // Scan the body of the loop in a topological order to visit each basic block 7664 // after having visited its predecessor basic blocks. 7665 LoopBlocksDFS DFS(OrigLoop); 7666 DFS.perform(LI); 7667 7668 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 7669 // Relevant instructions from basic block BB will be grouped into VPRecipe 7670 // ingredients and fill a new VPBasicBlock. 7671 unsigned VPBBsForBB = 0; 7672 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 7673 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 7674 VPBB = FirstVPBBForBB; 7675 Builder.setInsertPoint(VPBB); 7676 7677 // Introduce each ingredient into VPlan. 7678 // TODO: Model and preserve debug instrinsics in VPlan. 7679 for (Instruction &I : BB->instructionsWithoutDebug()) { 7680 Instruction *Instr = &I; 7681 7682 // First filter out irrelevant instructions, to ensure no recipes are 7683 // built for them. 7684 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 7685 continue; 7686 7687 if (auto Recipe = 7688 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 7689 RecipeBuilder.setRecipe(Instr, Recipe); 7690 VPBB->appendRecipe(Recipe); 7691 continue; 7692 } 7693 7694 // Otherwise, if all widening options failed, Instruction is to be 7695 // replicated. This may create a successor for VPBB. 7696 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7697 Instr, Range, VPBB, PredInst2Recipe, Plan); 7698 if (NextVPBB != VPBB) { 7699 VPBB = NextVPBB; 7700 VPBB->setName(BB->hasName() ? BB->getName() + "." 
+ Twine(VPBBsForBB++) 7701 : ""); 7702 } 7703 } 7704 } 7705 7706 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7707 // may also be empty, such as the last one VPBB, reflecting original 7708 // basic-blocks with no recipes. 7709 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7710 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7711 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7712 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7713 delete PreEntry; 7714 7715 // --------------------------------------------------------------------------- 7716 // Transform initial VPlan: Apply previously taken decisions, in order, to 7717 // bring the VPlan to its final state. 7718 // --------------------------------------------------------------------------- 7719 7720 // Apply Sink-After legal constraints. 7721 for (auto &Entry : SinkAfter) { 7722 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7723 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7724 Sink->moveAfter(Target); 7725 } 7726 7727 // Interleave memory: for each Interleave Group we marked earlier as relevant 7728 // for this VPlan, replace the Recipes widening its memory instructions with a 7729 // single VPInterleaveRecipe at its insertion point. 7730 for (auto IG : InterleaveGroups) { 7731 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7732 RecipeBuilder.getRecipe(IG->getInsertPos())); 7733 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7734 ->insertBefore(Recipe); 7735 7736 for (unsigned i = 0; i < IG->getFactor(); ++i) 7737 if (Instruction *Member = IG->getMember(i)) { 7738 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7739 } 7740 } 7741 7742 // Adjust the recipes for any inloop reductions. 
  // In-loop reduction recipes are only introduced when actually vectorizing
  // (Range.Start > 1); a scalar plan keeps the original reduction chain.
  if (Range.Start > 1)
    adjustRecipesForInLoopReductions(Plan, RecipeBuilder);

  // Finally, if tail is folded by masking, introduce selects between the phi
  // and the live-out instruction of each reduction, at the end of the latch.
  if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
    Builder.setInsertPoint(VPBB);
    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
    for (auto &Reduction : Legal->getReductionVars()) {
      assert(!CM.isInLoopReduction(Reduction.first) &&
             "Didn't expect inloop tail folded reduction yet!");
      VPValue *Phi = Plan->getVPValue(Reduction.first);
      VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
      Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
    }
  }

  // Register every power-of-2 VF in [Range.Start, Range.End) with the plan
  // and name the plan after that VF range.
  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  ElementCount VF = ElementCount::getFixed(Range.Start);
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF.Min *= 2; VF.Min < Range.End; VF.Min *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}

// Build a VPlan for the VPlan-native (outer-loop) path, covering all VFs in
// Range. NOTE(review): unlike buildVPlanWithVPRecipes, this path starts from
// the hierarchical CFG of the original loop nest.
VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->empty());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan.
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG.
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  // Record every power-of-2 VF in [Range.Start, Range.End) as a candidate.
  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
    Plan->addVF(ElementCount::getFixed(VF));

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(
      OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
  return Plan;
}

// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi need to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    // Walk the reduction chain, replacing each widened recipe along it with an
    // in-loop VPReductionRecipe.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
          Kind == RecurrenceDescriptor::RK_FloatMinMax) {
        // Min/max reductions are widened as selects whose operand 0 is the
        // compare, so the candidate reduction operands start at operand 1.
        assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      // Whichever candidate operand is not the incoming chain is the one that
      // gets reduced as a vector.
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      // Insert the reduction recipe at the widened recipe's position, then
      // remove the widened recipe it replaces.
      VPReductionRecipe *RedRecipe = new VPReductionRecipe(
          &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI);
      WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
      WidenRecipe->eraseFromParent();

      if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
          Kind == RecurrenceDescriptor::RK_FloatMinMax) {
        // For min/max, the widened compare feeding the select is dead too.
        VPRecipeBase *CompareRecipe =
            RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
        assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
               "Expected to replace a VPWidenSC");
        CompareRecipe->eraseFromParent();
      }
      Chain = R;
    }
  }
}

// Callback from VPlan execution: forward vector-value queries to the
// InnerLoopVectorizer's value map.
Value* LoopVectorizationPlanner::VPCallbackILV::
getOrCreateVectorValues(Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}

// Callback from VPlan execution: forward scalar-value queries to the
// InnerLoopVectorizer's value map.
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
    Value *V, const VPIteration &Instance) {
  return ILV.getOrCreateScalarValue(V, Instance);
}

// Print this interleave group: its factor, insert position, address, optional
// mask, and each member ingredient (escaped for VPlan's printed output).
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  VPValue *Mask = getMask();
  if (Mask) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }
  for (unsigned i = 0; i < IG->getFactor(); ++i)
    if (Instruction *I = IG->getMember(i))
      O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i;
}

// The following execute() implementations delegate the actual widening work
// to the InnerLoopVectorizer carried in the transform state.

void VPWidenCallRecipe::execute(VPTransformState &State) {
  State.ILV->widenCallInstruction(Ingredient, User, State);
}

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State);
}

void VPWidenRecipe::execute(VPTransformState &State) {
  State.ILV->widenInstruction(Ingredient, User, State);
}

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant,
                      IsIndexLoopInvariant, State);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, Trunc);
}

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
}

// Lower a blend phi into a chain of selects over its incoming values, guarded
// by the per-edge masks.
void VPBlendRecipe::execute(VPTransformState &State) {
  State.ILV->setDebugLocFromInst(State.Builder, Phi);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi and
  // are essentially undef are taken from In0.
  InnerLoopVectorizer::VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  // Publish the final select chain as the vector value of the phi.
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}

void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
}

// Emit an in-loop reduction: per unrolled part, reduce the vector operand to
// a scalar and combine it with the incoming scalar chain value.
void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    unsigned Kind = RdxDesc->getRecurrenceKind();
    // Reduce this part's vector operand down to a single scalar.
    Value *NewVecOp = State.get(VecOp, Part);
    Value *NewRed =
        createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
    // Combine the reduced value with the scalar chain flowing in.
    Value *PrevInChain = State.get(ChainOp, Part);
    Value *NextInChain;
    if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
        Kind == RecurrenceDescriptor::RK_FloatMinMax) {
      NextInChain =
          createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
                         NewRed, PrevInChain);
    } else {
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain);
    }
    State.ValueMap.setVectorValue(I, Part, NextInChain);
  }
}

// Scalarize the ingredient: either a single requested instance, or all lanes
// of all unrolled parts (only lane 0 per part when the value is uniform).
void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
                                    IsPredicated, State);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        assert(!State.VF.Scalable && "VF is assumed to be non scalable.");
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform in which case generate only the first lane for each
  // of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.Min;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
                                      IsPredicated, State);
}

// Turn the block's placeholder terminator into a conditional branch on this
// recipe's mask (or on 'true' when there is no mask).
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    // If the mask was materialized as a vector, extract this lane's bit.
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  // Successor 0 is a placeholder; both destinations are patched in later.
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

// Create the phi that merges a predicated instruction's value with the value
// flowing around its predicated block.
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst = cast<Instruction>(
      State.ValueMap.getScalarValue(PredInst, *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
  } else {
    Type *PredInstType = PredInst->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    // Lanes that skipped the predicated block carry an undef scalar.
    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
  }
}

void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  // Only stores carry a stored-value operand; loads pass nullptr.
  VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
  State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
                                        getMask());
}

// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
    LoopVectorizationLegality &LVL) {
  // 1) OptSize takes precedence over all other options, i.e. if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
  // LoopAccessInfo (due to code dependency and not being able to reliably get
  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
  // versioning when the vectorization is forced, unlike hasOptSize. So revert
  // back to the old way and vectorize with versioning when forced. See D81345.)
  if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                      PGSOQueryType::IRPass) &&
                          Hints.getForce() != LoopVectorizeHints::FK_Enabled))
    return CM_ScalarEpilogueNotAllowedOptSize;

  // True only when the option was given explicitly on the command line AND it
  // asks to disable predication.
  bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
                              !PreferPredicateOverEpilogue;

  // 2) Next, if disabling predication is requested on the command line, honour
  // this and request a scalar epilogue.
  if (PredicateOptDisabled)
    return CM_ScalarEpilogueAllowed;

  // 3) and 4) look if enabling predication is requested on the command line,
  // with a loop hint, or if the TTI hook indicates this is profitable, request
  // predication.
  if (PreferPredicateOverEpilogue ||
      Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
      (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                        LVL.getLAI()) &&
       Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  // An unknown trip count of the outer loop cannot be handled on this path.
  if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);

  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF =
      LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  // Outer-loop vectorization always uses an interleave count of 1.
  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM, BFI, PSI);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

// Analyze, plan and (possibly) vectorize or interleave a single loop.
// Returns true if the IR was changed.
bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->empty()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " unroll=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->empty())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints);

  assert(L->empty() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor and interleave count.
  unsigned UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF =
      LVP.plan(ElementCount::getFixed(UserVF), UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                      "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    // VF == 1 means the cost model chose not to widen; we may still interleave.
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                      "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleaving the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  // Remember the original loop metadata so follow-up metadata can be derived
  // for the remainder loop below.
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM, BFI, PSI);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Run the vectorizer over every supported loop in F. Returns whether any IR
// and/or the CFG was changed.
LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  // Cache the analyses in members for use by processLoop and friends.
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything end up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
8520 SmallVector<Loop *, 8> Worklist; 8521 8522 for (Loop *L : *LI) 8523 collectSupportedLoops(*L, LI, ORE, Worklist); 8524 8525 LoopsAnalyzed += Worklist.size(); 8526 8527 // Now walk the identified inner loops. 8528 while (!Worklist.empty()) { 8529 Loop *L = Worklist.pop_back_val(); 8530 8531 // For the inner loops we actually process, form LCSSA to simplify the 8532 // transform. 8533 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 8534 8535 Changed |= CFGChanged |= processLoop(L); 8536 } 8537 8538 // Process each loop nest in the function. 8539 return LoopVectorizeResult(Changed, CFGChanged); 8540 } 8541 8542 PreservedAnalyses LoopVectorizePass::run(Function &F, 8543 FunctionAnalysisManager &AM) { 8544 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 8545 auto &LI = AM.getResult<LoopAnalysis>(F); 8546 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 8547 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 8548 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 8549 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 8550 auto &AA = AM.getResult<AAManager>(F); 8551 auto &AC = AM.getResult<AssumptionAnalysis>(F); 8552 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 8553 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 8554 MemorySSA *MSSA = EnableMSSALoopDependency 8555 ? 
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 8556 : nullptr; 8557 8558 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 8559 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 8560 [&](Loop &L) -> const LoopAccessInfo & { 8561 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 8562 return LAM.getResult<LoopAccessAnalysis>(L, AR); 8563 }; 8564 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 8565 ProfileSummaryInfo *PSI = 8566 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 8567 LoopVectorizeResult Result = 8568 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 8569 if (!Result.MadeAnyChange) 8570 return PreservedAnalyses::all(); 8571 PreservedAnalyses PA; 8572 8573 // We currently do not preserve loopinfo/dominator analyses with outer loop 8574 // vectorization. Until this is addressed, mark these analyses as preserved 8575 // only for non-VPlan-native path. 8576 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 8577 if (!EnableVPlanNativePath) { 8578 PA.preserve<LoopAnalysis>(); 8579 PA.preserve<DominatorTreeAnalysis>(); 8580 } 8581 PA.preserve<BasicAA>(); 8582 PA.preserve<GlobalsAA>(); 8583 if (!Result.MadeCFGChange) 8584 PA.preserveSet<CFGAnalyses>(); 8585 return PA; 8586 } 8587