//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
}

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
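/// (Illustrative note, not taken from the original comment: a group that spans
/// an interleave factor of three but only accesses two of the members leaves a
/// gap, and masking the widened access keeps the gap lanes from being touched.)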
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
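///
/// (Illustrative example, assuming a typical x86-64 data layout: x86_fp80 has
/// a store size of 10 bytes but an alloc size of 16 bytes, so arrays of it
/// contain padding between elements; such a type is considered irregular and
/// its accesses cannot simply be widened by bitcasting to a vector access.)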
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
                ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handle real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  void createInductionResumeValues(Loop *L, Value *VectorTripCount);

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
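  /// (In brief, and as a summary rather than a specification: this caches, for
  /// the user-provided VF, which values stay uniform or scalar and which
  /// instructions are profitable to scalarize, so later cost queries against
  /// that VF are consistent; see the called helpers for the exact behavior.)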
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way, and
  /// the form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
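  /// (A common case, given here for illustration: the pointer operand of a
  /// consecutive, widened load or store is uniform in this sense, since only
  /// the lane-zero address is needed to form the wide memory operation and no
  /// per-lane vector of addresses has to be materialized.)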
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W, unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(!VF.isScalable() && "scalable vectors not yet supported.");
    assert(VF.isVector() && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail loop.
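  /// (Explanatory note: when the tail is folded, the vector body also covers
  /// the remainder iterations, with lanes beyond the trip count disabled by
  /// the mask, so no scalar epilogue loop is needed.)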
1385 bool foldTailByMasking() const { return FoldTailByMasking; } 1386 1387 bool blockNeedsPredication(BasicBlock *BB) { 1388 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1389 } 1390 1391 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1392 /// nodes to the chain of instructions representing the reductions. Uses a 1393 /// MapVector to ensure deterministic iteration order. 1394 using ReductionChainMap = 1395 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1396 1397 /// Return the chain of instructions representing an inloop reduction. 1398 const ReductionChainMap &getInLoopReductionChains() const { 1399 return InLoopReductionChains; 1400 } 1401 1402 /// Returns true if the Phi is part of an inloop reduction. 1403 bool isInLoopReduction(PHINode *Phi) const { 1404 return InLoopReductionChains.count(Phi); 1405 } 1406 1407 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1408 /// with factor VF. Return the cost of the instruction, including 1409 /// scalarization overhead if it's needed. 1410 unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1411 1412 /// Estimate cost of a call instruction CI if it were vectorized with factor 1413 /// VF. Return the cost of the instruction, including scalarization overhead 1414 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1415 /// scalarized - 1416 /// i.e. either vector version isn't available, or is too expensive. 1417 unsigned getVectorCallCost(CallInst *CI, ElementCount VF, 1418 bool &NeedToScalarize); 1419 1420 /// Invalidates decisions already taken by the cost model. 1421 void invalidateCostModelingDecisions() { 1422 WideningDecisions.clear(); 1423 Uniforms.clear(); 1424 Scalars.clear(); 1425 } 1426 1427 private: 1428 unsigned NumPredStores = 0; 1429 1430 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1431 /// than zero. One is returned if vectorization should best be avoided due 1432 /// to cost. 1433 unsigned computeFeasibleMaxVF(unsigned ConstTripCount); 1434 1435 /// The vectorization cost is a combination of the cost itself and a boolean 1436 /// indicating whether any of the contributing operations will actually 1437 /// operate on 1438 /// vector values after type legalization in the backend. If this latter value 1439 /// is 1440 /// false, then all operations will be scalarized (i.e. no vectorization has 1441 /// actually taken place). 1442 using VectorizationCostTy = std::pair<unsigned, bool>; 1443 1444 /// Returns the expected execution cost. The unit of the cost does 1445 /// not matter because we use the 'cost' units to compare different 1446 /// vector widths. The cost that is returned is *not* normalized by 1447 /// the factor width. 1448 VectorizationCostTy expectedCost(ElementCount VF); 1449 1450 /// Returns the execution time cost of an instruction for a given vector 1451 /// width. Vector width of one means scalar. 1452 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1453 1454 /// The cost-computation logic from getInstructionCost which provides 1455 /// the vector type as an output parameter. 1456 unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy); 1457 1458 /// Calculate vectorization cost of memory instruction \p I. 1459 unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF); 1460 1461 /// The cost computation for scalarized memory instruction. 
1462 unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1463
1464 /// The cost computation for interleaving group of memory instructions.
1465 unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1466
1467 /// The cost computation for Gather/Scatter instruction.
1468 unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1469
1470 /// The cost computation for widening instruction \p I with consecutive
1471 /// memory access.
1472 unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1473
1474 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1475 /// Load: scalar load + broadcast.
1476 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1477 /// element)
1478 unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1479
1480 /// Estimate the overhead of scalarizing an instruction. This is a
1481 /// convenience wrapper for the type-based getScalarizationOverhead API.
1482 unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1483
1484 /// Returns whether the instruction is a load or store and will be emitted
1485 /// as a vector operation.
1486 bool isConsecutiveLoadOrStore(Instruction *I);
1487
1488 /// Returns true if an artificially high cost for emulated masked memrefs
1489 /// should be used.
1490 bool useEmulatedMaskMemRefHack(Instruction *I);
1491
1492 /// Map of scalar integer values to the smallest bitwidth they can be legally
1493 /// represented as. The vector equivalents of these values should be truncated
1494 /// to this type.
1495 MapVector<Instruction *, uint64_t> MinBWs;
1496
1497 /// A type representing the costs for instructions if they were to be
1498 /// scalarized rather than vectorized. The entries are Instruction-Cost
1499 /// pairs.
1500 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1501
1502 /// A set containing all BasicBlocks that are known to be present after
1503 /// vectorization as a predicated block.
1504 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1505
1506 /// Records whether it is allowed to have the original scalar loop execute at
1507 /// least once. This may be needed as a fallback loop in case runtime
1508 /// aliasing/dependence checks fail, or to handle the tail/remainder
1509 /// iterations when the trip count is unknown or isn't divisible by the VF,
1510 /// or as a peel-loop to handle gaps in interleave-groups.
1511 /// Under optsize and when the trip count is very small we don't allow any
1512 /// iterations to execute in the scalar loop.
1513 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1514
1515 /// All blocks of the loop are to be masked to fold the tail of scalar iterations.
1516 bool FoldTailByMasking = false;
1517
1518 /// A map holding scalar costs for different vectorization factors. The
1519 /// presence of a cost for an instruction in the mapping indicates that the
1520 /// instruction will be scalarized when vectorizing with the associated
1521 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1522 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1523
1524 /// Holds the instructions known to be uniform after vectorization.
1525 /// The data is collected per VF.
1526 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1527
1528 /// Holds the instructions known to be scalar after vectorization.
1529 /// The data is collected per VF.
1530 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1531 1532 /// Holds the instructions (address computations) that are forced to be 1533 /// scalarized. 1534 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1535 1536 /// PHINodes of the reductions that should be expanded in-loop along with 1537 /// their associated chains of reduction operations, in program order from top 1538 /// (PHI) to bottom 1539 ReductionChainMap InLoopReductionChains; 1540 1541 /// Returns the expected difference in cost from scalarizing the expression 1542 /// feeding a predicated instruction \p PredInst. The instructions to 1543 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1544 /// non-negative return value implies the expression will be scalarized. 1545 /// Currently, only single-use chains are considered for scalarization. 1546 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1547 ElementCount VF); 1548 1549 /// Collect the instructions that are uniform after vectorization. An 1550 /// instruction is uniform if we represent it with a single scalar value in 1551 /// the vectorized loop corresponding to each vector iteration. Examples of 1552 /// uniform instructions include pointer operands of consecutive or 1553 /// interleaved memory accesses. Note that although uniformity implies an 1554 /// instruction will be scalar, the reverse is not true. In general, a 1555 /// scalarized instruction will be represented by VF scalar values in the 1556 /// vectorized loop, each corresponding to an iteration of the original 1557 /// scalar loop. 1558 void collectLoopUniforms(ElementCount VF); 1559 1560 /// Collect the instructions that are scalar after vectorization. An 1561 /// instruction is scalar if it is known to be uniform or will be scalarized 1562 /// during vectorization. Non-uniform scalarized instructions will be 1563 /// represented by VF values in the vectorized loop, each corresponding to an 1564 /// iteration of the original scalar loop. 1565 void collectLoopScalars(ElementCount VF); 1566 1567 /// Keeps cost model vectorization decision and cost for instructions. 1568 /// Right now it is used for memory instructions only. 1569 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1570 std::pair<InstWidening, unsigned>>; 1571 1572 DecisionList WideningDecisions; 1573 1574 /// Returns true if \p V is expected to be vectorized and it needs to be 1575 /// extracted. 1576 bool needsExtract(Value *V, ElementCount VF) const { 1577 Instruction *I = dyn_cast<Instruction>(V); 1578 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1579 TheLoop->isLoopInvariant(I)) 1580 return false; 1581 1582 // Assume we can vectorize V (and hence we need extraction) if the 1583 // scalars are not computed yet. This can happen, because it is called 1584 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1585 // the scalars are collected. That should be a safe assumption in most 1586 // cases, because we check if the operands have vectorizable types 1587 // beforehand in LoopVectorizationLegality. 1588 return Scalars.find(VF) == Scalars.end() || 1589 !isScalarAfterVectorization(I, VF); 1590 }; 1591 1592 /// Returns a range containing only operands needing to be extracted. 
1593 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1594 ElementCount VF) { 1595 return SmallVector<Value *, 4>(make_filter_range( 1596 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1597 } 1598 1599 public: 1600 /// The loop that we evaluate. 1601 Loop *TheLoop; 1602 1603 /// Predicated scalar evolution analysis. 1604 PredicatedScalarEvolution &PSE; 1605 1606 /// Loop Info analysis. 1607 LoopInfo *LI; 1608 1609 /// Vectorization legality. 1610 LoopVectorizationLegality *Legal; 1611 1612 /// Vector target information. 1613 const TargetTransformInfo &TTI; 1614 1615 /// Target Library Info. 1616 const TargetLibraryInfo *TLI; 1617 1618 /// Demanded bits analysis. 1619 DemandedBits *DB; 1620 1621 /// Assumption cache. 1622 AssumptionCache *AC; 1623 1624 /// Interface to emit optimization remarks. 1625 OptimizationRemarkEmitter *ORE; 1626 1627 const Function *TheFunction; 1628 1629 /// Loop Vectorize Hint. 1630 const LoopVectorizeHints *Hints; 1631 1632 /// The interleave access information contains groups of interleaved accesses 1633 /// with the same stride and close to each other. 1634 InterleavedAccessInfo &InterleaveInfo; 1635 1636 /// Values to ignore in the cost model. 1637 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1638 1639 /// Values to ignore in the cost model when VF > 1. 1640 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1641 }; 1642 1643 } // end namespace llvm 1644 1645 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1646 // vectorization. The loop needs to be annotated with #pragma omp simd 1647 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1648 // vector length information is not provided, vectorization is not considered 1649 // explicit. Interleave hints are not allowed either. These limitations will be 1650 // relaxed in the future. 1651 // Please, note that we are currently forced to abuse the pragma 'clang 1652 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1653 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1654 // provides *explicit vectorization hints* (LV can bypass legal checks and 1655 // assume that vectorization is legal). However, both hints are implemented 1656 // using the same metadata (llvm.loop.vectorize, processed by 1657 // LoopVectorizeHints). This will be fixed in the future when the native IR 1658 // representation for pragma 'omp simd' is introduced. 1659 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1660 OptimizationRemarkEmitter *ORE) { 1661 assert(!OuterLp->empty() && "This is not an outer loop"); 1662 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1663 1664 // Only outer loops with an explicit vectorization hint are supported. 1665 // Unannotated outer loops are ignored. 1666 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1667 return false; 1668 1669 Function *Fn = OuterLp->getHeader()->getParent(); 1670 if (!Hints.allowVectorization(Fn, OuterLp, 1671 true /*VectorizeOnlyWhenForced*/)) { 1672 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1673 return false; 1674 } 1675 1676 if (Hints.getInterleave() > 1) { 1677 // TODO: Interleave support is future work. 
1678 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1679 "outer loops.\n"); 1680 Hints.emitRemarkWithHints(); 1681 return false; 1682 } 1683 1684 return true; 1685 } 1686 1687 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1688 OptimizationRemarkEmitter *ORE, 1689 SmallVectorImpl<Loop *> &V) { 1690 // Collect inner loops and outer loops without irreducible control flow. For 1691 // now, only collect outer loops that have explicit vectorization hints. If we 1692 // are stress testing the VPlan H-CFG construction, we collect the outermost 1693 // loop of every loop nest. 1694 if (L.empty() || VPlanBuildStressTest || 1695 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1696 LoopBlocksRPO RPOT(&L); 1697 RPOT.perform(LI); 1698 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1699 V.push_back(&L); 1700 // TODO: Collect inner loops inside marked outer loops in case 1701 // vectorization fails for the outer loop. Do not invoke 1702 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1703 // already known to be reducible. We can use an inherited attribute for 1704 // that. 1705 return; 1706 } 1707 } 1708 for (Loop *InnerL : L) 1709 collectSupportedLoops(*InnerL, LI, ORE, V); 1710 } 1711 1712 namespace { 1713 1714 /// The LoopVectorize Pass. 1715 struct LoopVectorize : public FunctionPass { 1716 /// Pass identification, replacement for typeid 1717 static char ID; 1718 1719 LoopVectorizePass Impl; 1720 1721 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1722 bool VectorizeOnlyWhenForced = false) 1723 : FunctionPass(ID), 1724 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1725 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1726 } 1727 1728 bool runOnFunction(Function &F) override { 1729 if (skipFunction(F)) 1730 return false; 1731 1732 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1733 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1734 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1735 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1736 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1737 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1738 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 1739 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1740 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1741 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1742 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1743 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1744 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1745 1746 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1747 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1748 1749 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1750 GetLAA, *ORE, PSI).MadeAnyChange; 1751 } 1752 1753 void getAnalysisUsage(AnalysisUsage &AU) const override { 1754 AU.addRequired<AssumptionCacheTracker>(); 1755 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1756 AU.addRequired<DominatorTreeWrapperPass>(); 1757 AU.addRequired<LoopInfoWrapperPass>(); 1758 AU.addRequired<ScalarEvolutionWrapperPass>(); 1759 AU.addRequired<TargetTransformInfoWrapperPass>(); 1760 AU.addRequired<AAResultsWrapperPass>(); 1761 AU.addRequired<LoopAccessLegacyAnalysis>(); 1762 AU.addRequired<DemandedBitsWrapperPass>(); 1763 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1764 AU.addRequired<InjectTLIMappingsLegacy>(); 1765 1766 // We currently do not preserve loopinfo/dominator analyses with outer loop 1767 // vectorization. Until this is addressed, mark these analyses as preserved 1768 // only for non-VPlan-native path. 1769 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1770 if (!EnableVPlanNativePath) { 1771 AU.addPreserved<LoopInfoWrapperPass>(); 1772 AU.addPreserved<DominatorTreeWrapperPass>(); 1773 } 1774 1775 AU.addPreserved<BasicAAWrapperPass>(); 1776 AU.addPreserved<GlobalsAAWrapperPass>(); 1777 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1778 } 1779 }; 1780 1781 } // end anonymous namespace 1782 1783 //===----------------------------------------------------------------------===// 1784 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1785 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1786 //===----------------------------------------------------------------------===// 1787 1788 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1789 // We need to place the broadcast of invariant variables outside the loop, 1790 // but only if it's proven safe to do so. Else, broadcast will be inside 1791 // vector loop body. 1792 Instruction *Instr = dyn_cast<Instruction>(V); 1793 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1794 (!Instr || 1795 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1796 // Place the code for broadcasting invariant variables in the new preheader. 1797 IRBuilder<>::InsertPointGuard Guard(Builder); 1798 if (SafeToHoist) 1799 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1800 1801 // Broadcast the scalar into all locations in the vector. 
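// Illustrative sketch (not from the original source): for a fixed VF of 4 and
// an i32 scalar %x, the splat created below typically lowers to an
// insertelement into lane 0 followed by a shufflevector with an all-zero
// mask, roughly (value names are illustrative):
//   %ins = insertelement <4 x i32> undef, i32 %x, i32 0
//   %broadcast = shufflevector <4 x i32> %ins, <4 x i32> undef,
//                              <4 x i32> zeroinitializer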
1802 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1803 1804 return Shuf; 1805 } 1806 1807 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1808 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1809 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1810 "Expected either an induction phi-node or a truncate of it!"); 1811 Value *Start = II.getStartValue(); 1812 1813 // Construct the initial value of the vector IV in the vector loop preheader 1814 auto CurrIP = Builder.saveIP(); 1815 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1816 if (isa<TruncInst>(EntryVal)) { 1817 assert(Start->getType()->isIntegerTy() && 1818 "Truncation requires an integer type"); 1819 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1820 Step = Builder.CreateTrunc(Step, TruncType); 1821 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1822 } 1823 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1824 Value *SteppedStart = 1825 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1826 1827 // We create vector phi nodes for both integer and floating-point induction 1828 // variables. Here, we determine the kind of arithmetic we will perform. 1829 Instruction::BinaryOps AddOp; 1830 Instruction::BinaryOps MulOp; 1831 if (Step->getType()->isIntegerTy()) { 1832 AddOp = Instruction::Add; 1833 MulOp = Instruction::Mul; 1834 } else { 1835 AddOp = II.getInductionOpcode(); 1836 MulOp = Instruction::FMul; 1837 } 1838 1839 // Multiply the vectorization factor by the step using integer or 1840 // floating-point arithmetic as appropriate. 1841 Value *ConstVF = 1842 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 1843 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1844 1845 // Create a vector splat to use in the induction update. 1846 // 1847 // FIXME: If the step is non-constant, we create the vector splat with 1848 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1849 // handle a constant vector splat. 1850 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1851 Value *SplatVF = isa<Constant>(Mul) 1852 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 1853 : Builder.CreateVectorSplat(VF, Mul); 1854 Builder.restoreIP(CurrIP); 1855 1856 // We may need to add the step a number of times, depending on the unroll 1857 // factor. The last of those goes into the PHI. 1858 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1859 &*LoopVectorBody->getFirstInsertionPt()); 1860 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1861 Instruction *LastInduction = VecInd; 1862 for (unsigned Part = 0; Part < UF; ++Part) { 1863 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1864 1865 if (isa<TruncInst>(EntryVal)) 1866 addMetadata(LastInduction, EntryVal); 1867 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1868 1869 LastInduction = cast<Instruction>(addFastMathFlag( 1870 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1871 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1872 } 1873 1874 // Move the last step to the end of the latch block. This ensures consistent 1875 // placement of all induction updates. 
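// Worked example (illustrative): with VF = 4 and UF = 2, the vector IV starts
// at <Start, Start+Step, Start+2*Step, Start+3*Step>; each "step.add" adds a
// splat of 4*Step, and after the UF additions the final value, renamed
// "vec.ind.next" below, becomes the backedge value of the "vec.ind" PHI.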
1876 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1877 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1878 auto *ICmp = cast<Instruction>(Br->getCondition()); 1879 LastInduction->moveBefore(ICmp); 1880 LastInduction->setName("vec.ind.next"); 1881 1882 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1883 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1884 } 1885 1886 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1887 return Cost->isScalarAfterVectorization(I, VF) || 1888 Cost->isProfitableToScalarize(I, VF); 1889 } 1890 1891 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1892 if (shouldScalarizeInstruction(IV)) 1893 return true; 1894 auto isScalarInst = [&](User *U) -> bool { 1895 auto *I = cast<Instruction>(U); 1896 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1897 }; 1898 return llvm::any_of(IV->users(), isScalarInst); 1899 } 1900 1901 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1902 const InductionDescriptor &ID, const Instruction *EntryVal, 1903 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1904 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1905 "Expected either an induction phi-node or a truncate of it!"); 1906 1907 // This induction variable is not the phi from the original loop but the 1908 // newly-created IV based on the proof that casted Phi is equal to the 1909 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1910 // re-uses the same InductionDescriptor that original IV uses but we don't 1911 // have to do any recording in this case - that is done when original IV is 1912 // processed. 1913 if (isa<TruncInst>(EntryVal)) 1914 return; 1915 1916 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1917 if (Casts.empty()) 1918 return; 1919 // Only the first Cast instruction in the Casts vector is of interest. 1920 // The rest of the Casts (if exist) have no uses outside the 1921 // induction update chain itself. 1922 Instruction *CastInst = *Casts.begin(); 1923 if (Lane < UINT_MAX) 1924 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1925 else 1926 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1927 } 1928 1929 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1930 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1931 "Primary induction variable must have an integer type"); 1932 1933 auto II = Legal->getInductionVars().find(IV); 1934 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1935 1936 auto ID = II->second; 1937 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1938 1939 // The value from the original loop to which we are mapping the new induction 1940 // variable. 1941 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1942 1943 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1944 1945 // Generate code for the induction step. 
Note that induction steps are 1946 // required to be loop-invariant 1947 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1948 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1949 "Induction step should be loop invariant"); 1950 if (PSE.getSE()->isSCEVable(IV->getType())) { 1951 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1952 return Exp.expandCodeFor(Step, Step->getType(), 1953 LoopVectorPreHeader->getTerminator()); 1954 } 1955 return cast<SCEVUnknown>(Step)->getValue(); 1956 }; 1957 1958 // The scalar value to broadcast. This is derived from the canonical 1959 // induction variable. If a truncation type is given, truncate the canonical 1960 // induction variable and step. Otherwise, derive these values from the 1961 // induction descriptor. 1962 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1963 Value *ScalarIV = Induction; 1964 if (IV != OldInduction) { 1965 ScalarIV = IV->getType()->isIntegerTy() 1966 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1967 : Builder.CreateCast(Instruction::SIToFP, Induction, 1968 IV->getType()); 1969 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1970 ScalarIV->setName("offset.idx"); 1971 } 1972 if (Trunc) { 1973 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1974 assert(Step->getType()->isIntegerTy() && 1975 "Truncation requires an integer step"); 1976 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1977 Step = Builder.CreateTrunc(Step, TruncType); 1978 } 1979 return ScalarIV; 1980 }; 1981 1982 // Create the vector values from the scalar IV, in the absence of creating a 1983 // vector IV. 1984 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1985 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1986 for (unsigned Part = 0; Part < UF; ++Part) { 1987 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1988 Value *EntryPart = 1989 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 1990 ID.getInductionOpcode()); 1991 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1992 if (Trunc) 1993 addMetadata(EntryPart, Trunc); 1994 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1995 } 1996 }; 1997 1998 // Now do the actual transformations, and start with creating the step value. 1999 Value *Step = CreateStepValue(ID.getStep()); 2000 if (VF.isZero() || VF.isScalar()) { 2001 Value *ScalarIV = CreateScalarIV(Step); 2002 CreateSplatIV(ScalarIV, Step); 2003 return; 2004 } 2005 2006 // Determine if we want a scalar version of the induction variable. This is 2007 // true if the induction variable itself is not widened, or if it has at 2008 // least one user in the loop that is not widened. 2009 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2010 if (!NeedsScalarIV) { 2011 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2012 return; 2013 } 2014 2015 // Try to create a new independent vector induction variable. If we can't 2016 // create the phi node, we will splat the scalar induction variable in each 2017 // loop iteration. 2018 if (!shouldScalarizeInstruction(EntryVal)) { 2019 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2020 Value *ScalarIV = CreateScalarIV(Step); 2021 // Create scalar steps that can be used by instructions we will later 2022 // scalarize. Note that the addition of the scalar steps will not increase 2023 // the number of instructions in the loop in the common case prior to 2024 // InstCombine. We will be trading one vector extract for each scalar step. 
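// Worked example (illustrative): with VF = 4 and UF = 1, the scalar steps
// built below are ScalarIV + 0*Step, ScalarIV + 1*Step, ScalarIV + 2*Step and
// ScalarIV + 3*Step, one per lane (a uniform EntryVal only needs the
// lane-zero value).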
2025 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2026 return; 2027 } 2028 2029 // All IV users are scalar instructions, so only emit a scalar IV, not a 2030 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2031 // predicate used by the masked loads/stores. 2032 Value *ScalarIV = CreateScalarIV(Step); 2033 if (!Cost->isScalarEpilogueAllowed()) 2034 CreateSplatIV(ScalarIV, Step); 2035 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2036 } 2037 2038 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2039 Instruction::BinaryOps BinOp) { 2040 // Create and check the types. 2041 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2042 int VLen = ValVTy->getNumElements(); 2043 2044 Type *STy = Val->getType()->getScalarType(); 2045 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2046 "Induction Step must be an integer or FP"); 2047 assert(Step->getType() == STy && "Step has wrong type"); 2048 2049 SmallVector<Constant *, 8> Indices; 2050 2051 if (STy->isIntegerTy()) { 2052 // Create a vector of consecutive numbers from zero to VF. 2053 for (int i = 0; i < VLen; ++i) 2054 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2055 2056 // Add the consecutive indices to the vector value. 2057 Constant *Cv = ConstantVector::get(Indices); 2058 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2059 Step = Builder.CreateVectorSplat(VLen, Step); 2060 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2061 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2062 // which can be found from the original scalar operations. 2063 Step = Builder.CreateMul(Cv, Step); 2064 return Builder.CreateAdd(Val, Step, "induction"); 2065 } 2066 2067 // Floating point induction. 2068 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2069 "Binary Opcode should be specified for FP induction"); 2070 // Create a vector of consecutive numbers from zero to VF. 2071 for (int i = 0; i < VLen; ++i) 2072 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2073 2074 // Add the consecutive indices to the vector value. 2075 Constant *Cv = ConstantVector::get(Indices); 2076 2077 Step = Builder.CreateVectorSplat(VLen, Step); 2078 2079 // Floating point operations had to be 'fast' to enable the induction. 2080 FastMathFlags Flags; 2081 Flags.setFast(); 2082 2083 Value *MulOp = Builder.CreateFMul(Cv, Step); 2084 if (isa<Instruction>(MulOp)) 2085 // Have to check, MulOp may be a constant 2086 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2087 2088 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2089 if (isa<Instruction>(BOp)) 2090 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2091 return BOp; 2092 } 2093 2094 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2095 Instruction *EntryVal, 2096 const InductionDescriptor &ID) { 2097 // We shouldn't have to build scalar steps if we aren't vectorizing. 2098 assert(VF.isVector() && "VF should be greater than one"); 2099 assert(!VF.isScalable() && 2100 "the code below assumes a fixed number of elements at compile time"); 2101 // Get the value type and ensure it and the step have the same integer type. 2102 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2103 assert(ScalarIVTy == Step->getType() && 2104 "Val and Step should have the same type"); 2105 2106 // We build scalar steps for both integer and floating-point induction 2107 // variables. 
Here, we determine the kind of arithmetic we will perform. 2108 Instruction::BinaryOps AddOp; 2109 Instruction::BinaryOps MulOp; 2110 if (ScalarIVTy->isIntegerTy()) { 2111 AddOp = Instruction::Add; 2112 MulOp = Instruction::Mul; 2113 } else { 2114 AddOp = ID.getInductionOpcode(); 2115 MulOp = Instruction::FMul; 2116 } 2117 2118 // Determine the number of scalars we need to generate for each unroll 2119 // iteration. If EntryVal is uniform, we only need to generate the first 2120 // lane. Otherwise, we generate all VF values. 2121 unsigned Lanes = 2122 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2123 ? 1 2124 : VF.getKnownMinValue(); 2125 // Compute the scalar steps and save the results in VectorLoopValueMap. 2126 for (unsigned Part = 0; Part < UF; ++Part) { 2127 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2128 auto *StartIdx = getSignedIntOrFpConstant( 2129 ScalarIVTy, VF.getKnownMinValue() * Part + Lane); 2130 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2131 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2132 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2133 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2134 } 2135 } 2136 } 2137 2138 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2139 assert(V != Induction && "The new induction variable should not be used."); 2140 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2141 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2142 2143 // If we have a stride that is replaced by one, do it here. Defer this for 2144 // the VPlan-native path until we start running Legal checks in that path. 2145 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2146 V = ConstantInt::get(V->getType(), 1); 2147 2148 // If we have a vector mapped to this value, return it. 2149 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2150 return VectorLoopValueMap.getVectorValue(V, Part); 2151 2152 // If the value has not been vectorized, check if it has been scalarized 2153 // instead. If it has been scalarized, and we actually need the value in 2154 // vector form, we will construct the vector values on demand. 2155 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2156 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2157 2158 // If we've scalarized a value, that value should be an instruction. 2159 auto *I = cast<Instruction>(V); 2160 2161 // If we aren't vectorizing, we can just copy the scalar map values over to 2162 // the vector map. 2163 if (VF == 1) { 2164 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2165 return ScalarValue; 2166 } 2167 2168 // Get the last scalar instruction we generated for V and Part. If the value 2169 // is known to be uniform after vectorization, this corresponds to lane zero 2170 // of the Part unroll iteration. Otherwise, the last instruction is the one 2171 // we created for the last vector lane of the Part unroll iteration. 2172 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2173 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2174 ? 0 2175 : VF.getKnownMinValue() - 1; 2176 auto *LastInst = cast<Instruction>( 2177 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2178 2179 // Set the insert point after the last scalarized instruction. This ensures 2180 // the insertelement sequence will directly follow the scalar definitions. 
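// Illustrative note: with VF = 4, the packing performed below rebuilds the
// vector value from the previously generated scalars using a chain of four
// insertelement instructions, one per lane, starting from an undef vector.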
2181 auto OldIP = Builder.saveIP(); 2182 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2183 Builder.SetInsertPoint(&*NewIP); 2184 2185 // However, if we are vectorizing, we need to construct the vector values. 2186 // If the value is known to be uniform after vectorization, we can just 2187 // broadcast the scalar value corresponding to lane zero for each unroll 2188 // iteration. Otherwise, we construct the vector values using insertelement 2189 // instructions. Since the resulting vectors are stored in 2190 // VectorLoopValueMap, we will only generate the insertelements once. 2191 Value *VectorValue = nullptr; 2192 if (Cost->isUniformAfterVectorization(I, VF)) { 2193 VectorValue = getBroadcastInstrs(ScalarValue); 2194 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2195 } else { 2196 // Initialize packing with insertelements to start from undef. 2197 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2198 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2199 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2200 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2201 packScalarIntoVectorValue(V, {Part, Lane}); 2202 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2203 } 2204 Builder.restoreIP(OldIP); 2205 return VectorValue; 2206 } 2207 2208 // If this scalar is unknown, assume that it is a constant or that it is 2209 // loop invariant. Broadcast V and save the value for future uses. 2210 Value *B = getBroadcastInstrs(V); 2211 VectorLoopValueMap.setVectorValue(V, Part, B); 2212 return B; 2213 } 2214 2215 Value * 2216 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2217 const VPIteration &Instance) { 2218 // If the value is not an instruction contained in the loop, it should 2219 // already be scalar. 2220 if (OrigLoop->isLoopInvariant(V)) 2221 return V; 2222 2223 assert(Instance.Lane > 0 2224 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2225 : true && "Uniform values only have lane zero"); 2226 2227 // If the value from the original loop has not been vectorized, it is 2228 // represented by UF x VF scalar values in the new loop. Return the requested 2229 // scalar value. 2230 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2231 return VectorLoopValueMap.getScalarValue(V, Instance); 2232 2233 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2234 // for the given unroll part. If this entry is not a vector type (i.e., the 2235 // vectorization factor is one), there is no need to generate an 2236 // extractelement instruction. 2237 auto *U = getOrCreateVectorValue(V, Instance.Part); 2238 if (!U->getType()->isVectorTy()) { 2239 assert(VF == 1 && "Value not scalarized has non-vector type"); 2240 return U; 2241 } 2242 2243 // Otherwise, the value from the original loop has been vectorized and is 2244 // represented by UF vector values. Extract and return the requested scalar 2245 // value from the appropriate vector lane. 
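// Illustrative example: for Instance.Lane == 2 and a <4 x i32> unroll part
// %part (name is illustrative), the code below emits
//   extractelement <4 x i32> %part, i32 2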
2246 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2247 } 2248 2249 void InnerLoopVectorizer::packScalarIntoVectorValue( 2250 Value *V, const VPIteration &Instance) { 2251 assert(V != Induction && "The new induction variable should not be used."); 2252 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2253 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2254 2255 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2256 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2257 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2258 Builder.getInt32(Instance.Lane)); 2259 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2260 } 2261 2262 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2263 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2264 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2265 SmallVector<int, 8> ShuffleMask; 2266 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2267 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2268 2269 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2270 ShuffleMask, "reverse"); 2271 } 2272 2273 // Return whether we allow using masked interleave-groups (for dealing with 2274 // strided loads/stores that reside in predicated blocks, or for dealing 2275 // with gaps). 2276 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2277 // If an override option has been passed in for interleaved accesses, use it. 2278 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2279 return EnableMaskedInterleavedMemAccesses; 2280 2281 return TTI.enableMaskedInterleavedAccessVectorization(); 2282 } 2283 2284 // Try to vectorize the interleave group that \p Instr belongs to. 2285 // 2286 // E.g. Translate following interleaved load group (factor = 3): 2287 // for (i = 0; i < N; i+=3) { 2288 // R = Pic[i]; // Member of index 0 2289 // G = Pic[i+1]; // Member of index 1 2290 // B = Pic[i+2]; // Member of index 2 2291 // ... // do something to R, G, B 2292 // } 2293 // To: 2294 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2295 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2296 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2297 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2298 // 2299 // Or translate following interleaved store group (factor = 3): 2300 // for (i = 0; i < N; i+=3) { 2301 // ... do something to R, G, B 2302 // Pic[i] = R; // Member of index 0 2303 // Pic[i+1] = G; // Member of index 1 2304 // Pic[i+2] = B; // Member of index 2 2305 // } 2306 // To: 2307 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2308 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2309 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2310 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2311 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2312 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2313 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2314 VPValue *Addr, VPValue *BlockInMask) { 2315 Instruction *Instr = Group->getInsertPos(); 2316 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2317 2318 // Prepare for the vector type of the interleaved load/store. 
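// Illustrative example: with an i32 member type, VF = 4 and an interleave
// factor of 3, the wide vector type computed below is <12 x i32>, matching
// the <12 x i32> example in the comment above.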
2319 Type *ScalarTy = getMemInstValueType(Instr);
2320 unsigned InterleaveFactor = Group->getFactor();
2321 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2322 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2323
2324 // Prepare for the new pointers.
2325 SmallVector<Value *, 2> AddrParts;
2326 unsigned Index = Group->getIndex(Instr);
2327
2328 // TODO: extend the masked interleaved-group support to reversed access.
2329 assert((!BlockInMask || !Group->isReverse()) &&
2330 "Reversed masked interleave-group not supported.");
2331
2332 // If the group is reverse, adjust the index to refer to the last vector lane
2333 // instead of the first. We adjust the index from the first vector lane,
2334 // rather than directly getting the pointer for lane VF - 1, because the
2335 // pointer operand of the interleaved access is supposed to be uniform. For
2336 // uniform instructions, we're only required to generate a value for the
2337 // first vector lane in each unroll iteration.
2338 assert(!VF.isScalable() &&
2339 "scalable vector reverse operation is not implemented");
2340 if (Group->isReverse())
2341 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2342
2343 for (unsigned Part = 0; Part < UF; Part++) {
2344 Value *AddrPart = State.get(Addr, {Part, 0});
2345 setDebugLocFromInst(Builder, AddrPart);
2346
2347 // Note that the current instruction could be at any index. We need to adjust
2348 // the address to the member of index 0.
2349 //
2350 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2351 // b = A[i]; // Member of index 0
2352 // The current pointer points to A[i+1]; adjust it to A[i].
2353 //
2354 // E.g. A[i+1] = a; // Member of index 1
2355 // A[i] = b; // Member of index 0
2356 // A[i+2] = c; // Member of index 2 (Current instruction)
2357 // The current pointer points to A[i+2]; adjust it to A[i].
2358
2359 bool InBounds = false;
2360 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2361 InBounds = gep->isInBounds();
2362 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2363 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2364
2365 // Cast to the vector pointer type.
2366 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2367 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2368 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2369 }
2370
2371 setDebugLocFromInst(Builder, Instr);
2372 Value *UndefVec = UndefValue::get(VecTy);
2373
2374 Value *MaskForGaps = nullptr;
2375 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2376 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2377 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2378 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2379 }
2380
2381 // Vectorize the interleaved load group.
2382 if (isa<LoadInst>(Instr)) {
2383 // For each unroll part, create a wide load for the group.
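// Illustrative example: with VF = 4 and an interleave factor of 3, a block
// mask <m0, m1, m2, m3> is replicated to
// <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3> below, so that each lane's
// predicate covers every member of its tuple before the masked load is issued.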
2384 SmallVector<Value *, 2> NewLoads;
2385 for (unsigned Part = 0; Part < UF; Part++) {
2386 Instruction *NewLoad;
2387 if (BlockInMask || MaskForGaps) {
2388 assert(useMaskedInterleavedAccesses(*TTI) &&
2389 "masked interleaved groups are not allowed.");
2390 Value *GroupMask = MaskForGaps;
2391 if (BlockInMask) {
2392 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2393 auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2394 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2395 Value *ShuffledMask = Builder.CreateShuffleVector(
2396 BlockInMaskPart, Undefs,
2397 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2398 "interleaved.mask");
2399 GroupMask = MaskForGaps
2400 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2401 MaskForGaps)
2402 : ShuffledMask;
2403 }
2404 NewLoad =
2405 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2406 GroupMask, UndefVec, "wide.masked.vec");
2407 }
2408 else
2409 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2410 Group->getAlign(), "wide.vec");
2411 Group->addMetadata(NewLoad);
2412 NewLoads.push_back(NewLoad);
2413 }
2414
2415 // For each member in the group, shuffle out the appropriate data from the
2416 // wide loads.
2417 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2418 Instruction *Member = Group->getMember(I);
2419
2420 // Skip the gaps in the group.
2421 if (!Member)
2422 continue;
2423
2424 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2425 auto StrideMask =
2426 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2427 for (unsigned Part = 0; Part < UF; Part++) {
2428 Value *StridedVec = Builder.CreateShuffleVector(
2429 NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2430
2431 // If this member has a different type, cast the result to that type.
2432 if (Member->getType() != ScalarTy) {
2433 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2434 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2435 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2436 }
2437
2438 if (Group->isReverse())
2439 StridedVec = reverseVector(StridedVec);
2440
2441 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2442 }
2443 }
2444 return;
2445 }
2446
2447 // The sub-vector type for the current instruction.
2448 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2449 auto *SubVT = VectorType::get(ScalarTy, VF);
2450
2451 // Vectorize the interleaved store group.
2452 for (unsigned Part = 0; Part < UF; Part++) {
2453 // Collect the stored vector from each member.
2454 SmallVector<Value *, 4> StoredVecs;
2455 for (unsigned i = 0; i < InterleaveFactor; i++) {
2456 // An interleaved store group doesn't allow a gap, so each index has a member.
2457 Instruction *Member = Group->getMember(i);
2458 assert(Member && "Fail to get a member from an interleaved store group");
2459
2460 Value *StoredVec = getOrCreateVectorValue(
2461 cast<StoreInst>(Member)->getValueOperand(), Part);
2462 if (Group->isReverse())
2463 StoredVec = reverseVector(StoredVec);
2464
2465 // If this member has a different type, cast it to the unified type.
2466
2467 if (StoredVec->getType() != SubVT)
2468 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2469
2470 StoredVecs.push_back(StoredVec);
2471 }
2472
2473 // Concatenate all vectors into a wide vector.
2474 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2475
2476 // Interleave the elements in the wide vector.
2477 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2478 Value *IVec = Builder.CreateShuffleVector( 2479 WideVec, UndefVec, 2480 createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2481 "interleaved.vec"); 2482 2483 Instruction *NewStoreInstr; 2484 if (BlockInMask) { 2485 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2486 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2487 Value *ShuffledMask = Builder.CreateShuffleVector( 2488 BlockInMaskPart, Undefs, 2489 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2490 "interleaved.mask"); 2491 NewStoreInstr = Builder.CreateMaskedStore( 2492 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2493 } 2494 else 2495 NewStoreInstr = 2496 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2497 2498 Group->addMetadata(NewStoreInstr); 2499 } 2500 } 2501 2502 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2503 VPTransformState &State, 2504 VPValue *Addr, 2505 VPValue *StoredValue, 2506 VPValue *BlockInMask) { 2507 // Attempt to issue a wide load. 2508 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2509 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2510 2511 assert((LI || SI) && "Invalid Load/Store instruction"); 2512 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2513 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2514 2515 LoopVectorizationCostModel::InstWidening Decision = 2516 Cost->getWideningDecision(Instr, VF); 2517 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2518 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2519 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2520 "CM decision is not to widen the memory instruction"); 2521 2522 Type *ScalarDataTy = getMemInstValueType(Instr); 2523 2524 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2525 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2526 const Align Alignment = getLoadStoreAlignment(Instr); 2527 2528 // Determine if the pointer operand of the access is either consecutive or 2529 // reverse consecutive. 2530 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2531 bool ConsecutiveStride = 2532 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2533 bool CreateGatherScatter = 2534 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2535 2536 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2537 // gather/scatter. Otherwise Decision should have been to Scalarize. 2538 assert((ConsecutiveStride || CreateGatherScatter) && 2539 "The instruction should be scalarized"); 2540 (void)ConsecutiveStride; 2541 2542 VectorParts BlockInMaskParts(UF); 2543 bool isMaskRequired = BlockInMask; 2544 if (isMaskRequired) 2545 for (unsigned Part = 0; Part < UF; ++Part) 2546 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2547 2548 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2549 // Calculate the pointer for the specific unroll-part. 2550 GetElementPtrInst *PartPtr = nullptr; 2551 2552 bool InBounds = false; 2553 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2554 InBounds = gep->isInBounds(); 2555 2556 if (Reverse) { 2557 // If the address is consecutive but reversed, then the 2558 // wide store needs to start at the last vector element. 
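// Illustrative example: with VF = 4 and Part = 0, the two GEPs below offset
// the pointer by -0 * 4 = 0 and then by 1 - 4 = -3 elements, so the wide
// access starts three elements back and covers this part's elements; the data
// (and the mask, if any) are reversed separately.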
2559 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2560 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2561 PartPtr->setIsInBounds(InBounds); 2562 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2563 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2564 PartPtr->setIsInBounds(InBounds); 2565 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2566 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2567 } else { 2568 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2569 ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue()))); 2570 PartPtr->setIsInBounds(InBounds); 2571 } 2572 2573 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2574 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2575 }; 2576 2577 // Handle Stores: 2578 if (SI) { 2579 setDebugLocFromInst(Builder, SI); 2580 2581 for (unsigned Part = 0; Part < UF; ++Part) { 2582 Instruction *NewSI = nullptr; 2583 Value *StoredVal = State.get(StoredValue, Part); 2584 if (CreateGatherScatter) { 2585 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2586 Value *VectorGep = State.get(Addr, Part); 2587 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2588 MaskPart); 2589 } else { 2590 if (Reverse) { 2591 // If we store to reverse consecutive memory locations, then we need 2592 // to reverse the order of elements in the stored value. 2593 StoredVal = reverseVector(StoredVal); 2594 // We don't want to update the value in the map as it might be used in 2595 // another expression. So don't call resetVectorValue(StoredVal). 2596 } 2597 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2598 if (isMaskRequired) 2599 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2600 BlockInMaskParts[Part]); 2601 else 2602 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2603 } 2604 addMetadata(NewSI, SI); 2605 } 2606 return; 2607 } 2608 2609 // Handle loads. 2610 assert(LI && "Must have a load instruction"); 2611 setDebugLocFromInst(Builder, LI); 2612 for (unsigned Part = 0; Part < UF; ++Part) { 2613 Value *NewLI; 2614 if (CreateGatherScatter) { 2615 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2616 Value *VectorGep = State.get(Addr, Part); 2617 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2618 nullptr, "wide.masked.gather"); 2619 addMetadata(NewLI, LI); 2620 } else { 2621 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2622 if (isMaskRequired) 2623 NewLI = Builder.CreateMaskedLoad( 2624 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2625 "wide.masked.load"); 2626 else 2627 NewLI = 2628 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2629 2630 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2631 addMetadata(NewLI, LI); 2632 if (Reverse) 2633 NewLI = reverseVector(NewLI); 2634 } 2635 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2636 } 2637 } 2638 2639 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2640 const VPIteration &Instance, 2641 bool IfPredicateInstr, 2642 VPTransformState &State) { 2643 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2644 2645 setDebugLocFromInst(Builder, Instr); 2646 2647 // Does this instruction return a value ? 
2648 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2649
2650 Instruction *Cloned = Instr->clone();
2651 if (!IsVoidRetTy)
2652 Cloned->setName(Instr->getName() + ".cloned");
2653
2654 // Replace the operands of the cloned instructions with their scalar
2655 // equivalents in the new loop.
2656 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2657 auto *NewOp = State.get(User.getOperand(op), Instance);
2658 Cloned->setOperand(op, NewOp);
2659 }
2660 addNewMetadata(Cloned, Instr);
2661
2662 // Place the cloned scalar in the new loop.
2663 Builder.Insert(Cloned);
2664
2665 // Add the cloned scalar to the scalar map entry.
2666 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2667
2668 // If we just cloned a new assumption, add it to the assumption cache.
2669 if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2670 if (II->getIntrinsicID() == Intrinsic::assume)
2671 AC->registerAssumption(II);
2672
2673 // End if-block.
2674 if (IfPredicateInstr)
2675 PredicatedInstructions.push_back(Cloned);
2676 }
2677
2678 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2679 Value *End, Value *Step,
2680 Instruction *DL) {
2681 BasicBlock *Header = L->getHeader();
2682 BasicBlock *Latch = L->getLoopLatch();
2683 // As we're just creating this loop, it's possible no latch exists
2684 // yet. If so, use the header as this will be a single block loop.
2685 if (!Latch)
2686 Latch = Header;
2687
2688 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2689 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2690 setDebugLocFromInst(Builder, OldInst);
2691 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2692
2693 Builder.SetInsertPoint(Latch->getTerminator());
2694 setDebugLocFromInst(Builder, OldInst);
2695
2696 // Create i+1 and fill the PHINode.
2697 Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2698 Induction->addIncoming(Start, L->getLoopPreheader());
2699 Induction->addIncoming(Next, Latch);
2700 // Create the compare.
2701 Value *ICmp = Builder.CreateICmpEQ(Next, End);
2702 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2703
2704 // Now we have two terminators. Remove the old one from the block.
2705 Latch->getTerminator()->eraseFromParent();
2706
2707 return Induction;
2708 }
2709
2710 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2711 if (TripCount)
2712 return TripCount;
2713
2714 assert(L && "Create Trip Count for null loop.");
2715 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2716 // Find the loop boundaries.
2717 ScalarEvolution *SE = PSE.getSE();
2718 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2719 assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2720 "Invalid loop count");
2721
2722 Type *IdxTy = Legal->getWidestInductionType();
2723 assert(IdxTy && "No type for induction");
2724
2725 // The exit count might have the type of i64 while the phi is i32. This can
2726 // happen if we have an induction variable that is sign extended before the
2727 // compare. The only way we can get a backedge-taken count here is if the
2728 // induction variable was signed and as such will not overflow. In such a case
2729 // truncation is legal.
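// Illustrative example: for a loop running i = 0 .. 99 the backedge-taken
// count is 99; adding one below yields a trip count of 100.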
2730 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2731 IdxTy->getPrimitiveSizeInBits()) 2732 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2733 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2734 2735 // Get the total trip count from the count by adding 1. 2736 const SCEV *ExitCount = SE->getAddExpr( 2737 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2738 2739 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2740 2741 // Expand the trip count and place the new instructions in the preheader. 2742 // Notice that the pre-header does not change, only the loop body. 2743 SCEVExpander Exp(*SE, DL, "induction"); 2744 2745 // Count holds the overall loop count (N). 2746 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2747 L->getLoopPreheader()->getTerminator()); 2748 2749 if (TripCount->getType()->isPointerTy()) 2750 TripCount = 2751 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2752 L->getLoopPreheader()->getTerminator()); 2753 2754 return TripCount; 2755 } 2756 2757 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2758 if (VectorTripCount) 2759 return VectorTripCount; 2760 2761 Value *TC = getOrCreateTripCount(L); 2762 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2763 2764 Type *Ty = TC->getType(); 2765 // This is where we can make the step a runtime constant. 2766 assert(!VF.isScalable() && "scalable vectorization is not supported yet"); 2767 Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF); 2768 2769 // If the tail is to be folded by masking, round the number of iterations N 2770 // up to a multiple of Step instead of rounding down. This is done by first 2771 // adding Step-1 and then rounding down. Note that it's ok if this addition 2772 // overflows: the vector induction variable will eventually wrap to zero given 2773 // that it starts at zero and its Step is a power of two; the loop will then 2774 // exit, with the last early-exit vector comparison also producing all-true. 2775 if (Cost->foldTailByMasking()) { 2776 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2777 "VF*UF must be a power of 2 when folding tail by masking"); 2778 TC = Builder.CreateAdd( 2779 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 2780 } 2781 2782 // Now we need to generate the expression for the part of the loop that the 2783 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2784 // iterations are not required for correctness, or N - Step, otherwise. Step 2785 // is equal to the vectorization factor (number of SIMD elements) times the 2786 // unroll factor (number of SIMD instructions). 2787 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2788 2789 // If there is a non-reversed interleaved group that may speculatively access 2790 // memory out-of-bounds, we need to ensure that there will be at least one 2791 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2792 // the trip count, we set the remainder to be equal to the step. If the step 2793 // does not evenly divide the trip count, no adjustment is necessary since 2794 // there will already be scalar iterations. Note that the minimum iterations 2795 // check ensures that N >= Step. 
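// For illustration (arbitrary values): with trip count N = 17, VF = 4 and UF = 2,
// Step = 8 and N % Step = 1, so the vector loop covers 16 iterations and one is
// left for the scalar loop. If N = 16 and a scalar epilogue is required, the zero
// remainder is bumped up to Step, giving a vector trip count of 8 so the epilogue
// still executes. With tail folding, N is instead rounded up to 24 first and the
// whole (masked) iteration space is covered by the vector loop.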
2796 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 2797 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2798 R = Builder.CreateSelect(IsZero, Step, R); 2799 } 2800 2801 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2802 2803 return VectorTripCount; 2804 } 2805 2806 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2807 const DataLayout &DL) { 2808 // Verify that V is a vector type with same number of elements as DstVTy. 2809 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2810 unsigned VF = DstFVTy->getNumElements(); 2811 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2812 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2813 Type *SrcElemTy = SrcVecTy->getElementType(); 2814 Type *DstElemTy = DstFVTy->getElementType(); 2815 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2816 "Vector elements must have same size"); 2817 2818 // Do a direct cast if element types are castable. 2819 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2820 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2821 } 2822 // V cannot be directly casted to desired vector type. 2823 // May happen when V is a floating point vector but DstVTy is a vector of 2824 // pointers or vice-versa. Handle this using a two-step bitcast using an 2825 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2826 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2827 "Only one type should be a pointer type"); 2828 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2829 "Only one type should be a floating point type"); 2830 Type *IntTy = 2831 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2832 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2833 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2834 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2835 } 2836 2837 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2838 BasicBlock *Bypass) { 2839 Value *Count = getOrCreateTripCount(L); 2840 // Reuse existing vector loop preheader for TC checks. 2841 // Note that new preheader block is generated for vector loop. 2842 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2843 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2844 2845 // Generate code to check if the loop's trip count is less than VF * UF, or 2846 // equal to it in case a scalar epilogue is required; this implies that the 2847 // vector trip count is zero. This check also covers the case where adding one 2848 // to the backedge-taken count overflowed leading to an incorrect trip count 2849 // of zero. In this case we will also jump to the scalar loop. 2850 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2851 : ICmpInst::ICMP_ULT; 2852 2853 // If tail is to be folded, vector loop takes care of all iterations. 2854 Value *CheckMinIters = Builder.getFalse(); 2855 if (!Cost->foldTailByMasking()) { 2856 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2857 CheckMinIters = Builder.CreateICmp( 2858 P, Count, 2859 ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF), 2860 "min.iters.check"); 2861 } 2862 // Create new preheader for vector loop. 
2863 LoopVectorPreHeader = 2864 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2865 "vector.ph"); 2866 2867 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2868 DT->getNode(Bypass)->getIDom()) && 2869 "TC check is expected to dominate Bypass"); 2870 2871 // Update dominator for Bypass & LoopExit. 2872 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2873 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2874 2875 ReplaceInstWithInst( 2876 TCCheckBlock->getTerminator(), 2877 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2878 LoopBypassBlocks.push_back(TCCheckBlock); 2879 } 2880 2881 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2882 // Reuse existing vector loop preheader for SCEV checks. 2883 // Note that new preheader block is generated for vector loop. 2884 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2885 2886 // Generate the code to check that the SCEV assumptions that we made. 2887 // We want the new basic block to start at the first instruction in a 2888 // sequence of instructions that form a check. 2889 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2890 "scev.check"); 2891 Value *SCEVCheck = Exp.expandCodeForPredicate( 2892 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2893 2894 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2895 if (C->isZero()) 2896 return; 2897 2898 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2899 (OptForSizeBasedOnProfile && 2900 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2901 "Cannot SCEV check stride or overflow when optimizing for size"); 2902 2903 SCEVCheckBlock->setName("vector.scevcheck"); 2904 // Create new preheader for vector loop. 2905 LoopVectorPreHeader = 2906 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2907 nullptr, "vector.ph"); 2908 2909 // Update dominator only if this is first RT check. 2910 if (LoopBypassBlocks.empty()) { 2911 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2912 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2913 } 2914 2915 ReplaceInstWithInst( 2916 SCEVCheckBlock->getTerminator(), 2917 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2918 LoopBypassBlocks.push_back(SCEVCheckBlock); 2919 AddedSafetyChecks = true; 2920 } 2921 2922 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2923 // VPlan-native path does not do any analysis for runtime checks currently. 2924 if (EnableVPlanNativePath) 2925 return; 2926 2927 // Reuse existing vector loop preheader for runtime memory checks. 2928 // Note that new preheader block is generated for vector loop. 2929 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2930 2931 // Generate the code that checks in runtime if arrays overlap. We put the 2932 // checks into a separate block to make the more common case of few elements 2933 // faster. 
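// The emitted checks have roughly the following shape for two pointer groups A
// and B (illustrative; the exact values and pair grouping are produced by
// addRuntimeChecks below):
//   %bound0 = icmp ult i8* %A.start, %B.end
//   %bound1 = icmp ult i8* %B.start, %A.end
//   %found.conflict = and i1 %bound0, %bound1
// The per-pair results are or'ed together to form the condition of the bypass
// branch back to the scalar loop.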
2934 auto *LAI = Legal->getLAI(); 2935 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 2936 if (!RtPtrChecking.Need) 2937 return; 2938 2939 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2940 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2941 "Cannot emit memory checks when optimizing for size, unless forced " 2942 "to vectorize."); 2943 ORE->emit([&]() { 2944 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2945 L->getStartLoc(), L->getHeader()) 2946 << "Code-size may be reduced by not forcing " 2947 "vectorization, or by source-code modifications " 2948 "eliminating the need for runtime checks " 2949 "(e.g., adding 'restrict')."; 2950 }); 2951 } 2952 2953 MemCheckBlock->setName("vector.memcheck"); 2954 // Create new preheader for vector loop. 2955 LoopVectorPreHeader = 2956 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2957 "vector.ph"); 2958 2959 auto *CondBranch = cast<BranchInst>( 2960 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 2961 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 2962 LoopBypassBlocks.push_back(MemCheckBlock); 2963 AddedSafetyChecks = true; 2964 2965 // Update dominator only if this is first RT check. 2966 if (LoopBypassBlocks.empty()) { 2967 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2968 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2969 } 2970 2971 Instruction *FirstCheckInst; 2972 Instruction *MemRuntimeCheck; 2973 std::tie(FirstCheckInst, MemRuntimeCheck) = 2974 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 2975 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 2976 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 2977 "claimed checks are required"); 2978 CondBranch->setCondition(MemRuntimeCheck); 2979 2980 // We currently don't use LoopVersioning for the actual loop cloning but we 2981 // still use it to add the noalias metadata. 2982 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2983 PSE.getSE()); 2984 LVer->prepareNoAliasMetadata(); 2985 } 2986 2987 Value *InnerLoopVectorizer::emitTransformedIndex( 2988 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2989 const InductionDescriptor &ID) const { 2990 2991 SCEVExpander Exp(*SE, DL, "induction"); 2992 auto Step = ID.getStep(); 2993 auto StartValue = ID.getStartValue(); 2994 assert(Index->getType() == Step->getType() && 2995 "Index type does not match StepValue type"); 2996 2997 // Note: the IR at this point is broken. We cannot use SE to create any new 2998 // SCEV and then expand it, hoping that SCEV's simplification will give us 2999 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3000 // lead to various SCEV crashes. So all we can do is to use builder and rely 3001 // on InstCombine for future simplifications. Here we handle some trivial 3002 // cases only. 
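// For example (illustrative), the two helpers below fold the trivial cases by hand:
//   CreateAdd(0, X) -> X   and   CreateMul(1, X) -> X
// so a transformed index of the form Start + Step * Index with a constant Step of 1
// collapses into a single add rather than a mul feeding an add.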
3003 auto CreateAdd = [&B](Value *X, Value *Y) { 3004 assert(X->getType() == Y->getType() && "Types don't match!"); 3005 if (auto *CX = dyn_cast<ConstantInt>(X)) 3006 if (CX->isZero()) 3007 return Y; 3008 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3009 if (CY->isZero()) 3010 return X; 3011 return B.CreateAdd(X, Y); 3012 }; 3013 3014 auto CreateMul = [&B](Value *X, Value *Y) { 3015 assert(X->getType() == Y->getType() && "Types don't match!"); 3016 if (auto *CX = dyn_cast<ConstantInt>(X)) 3017 if (CX->isOne()) 3018 return Y; 3019 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3020 if (CY->isOne()) 3021 return X; 3022 return B.CreateMul(X, Y); 3023 }; 3024 3025 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3026 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3027 // the DomTree is not kept up-to-date for additional blocks generated in the 3028 // vector loop. By using the header as insertion point, we guarantee that the 3029 // expanded instructions dominate all their uses. 3030 auto GetInsertPoint = [this, &B]() { 3031 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3032 if (InsertBB != LoopVectorBody && 3033 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3034 return LoopVectorBody->getTerminator(); 3035 return &*B.GetInsertPoint(); 3036 }; 3037 switch (ID.getKind()) { 3038 case InductionDescriptor::IK_IntInduction: { 3039 assert(Index->getType() == StartValue->getType() && 3040 "Index type does not match StartValue type"); 3041 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3042 return B.CreateSub(StartValue, Index); 3043 auto *Offset = CreateMul( 3044 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3045 return CreateAdd(StartValue, Offset); 3046 } 3047 case InductionDescriptor::IK_PtrInduction: { 3048 assert(isa<SCEVConstant>(Step) && 3049 "Expected constant step for pointer induction"); 3050 return B.CreateGEP( 3051 StartValue->getType()->getPointerElementType(), StartValue, 3052 CreateMul(Index, 3053 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3054 } 3055 case InductionDescriptor::IK_FpInduction: { 3056 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3057 auto InductionBinOp = ID.getInductionBinOp(); 3058 assert(InductionBinOp && 3059 (InductionBinOp->getOpcode() == Instruction::FAdd || 3060 InductionBinOp->getOpcode() == Instruction::FSub) && 3061 "Original bin op should be defined for FP induction"); 3062 3063 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3064 3065 // Floating point operations had to be 'fast' to enable the induction. 3066 FastMathFlags Flags; 3067 Flags.setFast(); 3068 3069 Value *MulExp = B.CreateFMul(StepValue, Index); 3070 if (isa<Instruction>(MulExp)) 3071 // We have to check, the MulExp may be a constant. 
3072 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3073 3074 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3075 "induction"); 3076 if (isa<Instruction>(BOp)) 3077 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3078 3079 return BOp; 3080 } 3081 case InductionDescriptor::IK_NoInduction: 3082 return nullptr; 3083 } 3084 llvm_unreachable("invalid enum"); 3085 } 3086 3087 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3088 LoopScalarBody = OrigLoop->getHeader(); 3089 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3090 LoopExitBlock = OrigLoop->getExitBlock(); 3091 assert(LoopExitBlock && "Must have an exit block"); 3092 assert(LoopVectorPreHeader && "Invalid loop structure"); 3093 3094 LoopMiddleBlock = 3095 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3096 LI, nullptr, Twine(Prefix) + "middle.block"); 3097 LoopScalarPreHeader = 3098 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3099 nullptr, Twine(Prefix) + "scalar.ph"); 3100 // We intentionally don't let SplitBlock to update LoopInfo since 3101 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3102 // LoopVectorBody is explicitly added to the correct place few lines later. 3103 LoopVectorBody = 3104 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3105 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3106 3107 // Update dominator for loop exit. 3108 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3109 3110 // Create and register the new vector loop. 3111 Loop *Lp = LI->AllocateLoop(); 3112 Loop *ParentLoop = OrigLoop->getParentLoop(); 3113 3114 // Insert the new loop into the loop nest and register the new basic blocks 3115 // before calling any utilities such as SCEV that require valid LoopInfo. 3116 if (ParentLoop) { 3117 ParentLoop->addChildLoop(Lp); 3118 } else { 3119 LI->addTopLevelLoop(Lp); 3120 } 3121 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3122 return Lp; 3123 } 3124 3125 void InnerLoopVectorizer::createInductionResumeValues(Loop *L, 3126 Value *VectorTripCount) { 3127 assert(VectorTripCount && L && "Expected valid arguments"); 3128 // We are going to resume the execution of the scalar loop. 3129 // Go over all of the induction variables that we found and fix the 3130 // PHIs that are left in the scalar version of the loop. 3131 // The starting values of PHI nodes depend on the counter of the last 3132 // iteration in the vectorized loop. 3133 // If we come from a bypass edge then we need to start from the original 3134 // start value. 3135 for (auto &InductionEntry : Legal->getInductionVars()) { 3136 PHINode *OrigPhi = InductionEntry.first; 3137 InductionDescriptor II = InductionEntry.second; 3138 3139 // Create phi nodes to merge from the backedge-taken check block. 3140 PHINode *BCResumeVal = 3141 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3142 LoopScalarPreHeader->getTerminator()); 3143 // Copy original phi DL over to the new one. 3144 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3145 Value *&EndValue = IVEndValues[OrigPhi]; 3146 if (OrigPhi == OldInduction) { 3147 // We know what the end value is. 
3148 EndValue = VectorTripCount; 3149 } else { 3150 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3151 Type *StepType = II.getStep()->getType(); 3152 Instruction::CastOps CastOp = 3153 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3154 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3155 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3156 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3157 EndValue->setName("ind.end"); 3158 } 3159 3160 // The new PHI merges the original incoming value, in case of a bypass, 3161 // or the value at the end of the vectorized loop. 3162 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3163 3164 // Fix the scalar body counter (PHI node). 3165 // The old induction's phi node in the scalar body needs the truncated 3166 // value. 3167 for (BasicBlock *BB : LoopBypassBlocks) 3168 BCResumeVal->addIncoming(II.getStartValue(), BB); 3169 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3170 } 3171 } 3172 3173 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3174 MDNode *OrigLoopID) { 3175 assert(L && "Expected valid loop."); 3176 3177 // The trip counts should be cached by now. 3178 Value *Count = getOrCreateTripCount(L); 3179 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3180 3181 // We need the OrigLoop (scalar loop part) latch terminator to help 3182 // produce correct debug info for the middle block BB instructions. 3183 // The legality check stage guarantees that the loop will have a single 3184 // latch. 3185 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3186 "Scalar loop latch terminator isn't a branch"); 3187 BranchInst *ScalarLatchBr = 3188 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3189 3190 // Add a check in the middle block to see if we have completed 3191 // all of the iterations in the first vector loop. 3192 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3193 // If tail is to be folded, we know we don't need to run the remainder. 3194 Value *CmpN = Builder.getTrue(); 3195 if (!Cost->foldTailByMasking()) { 3196 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3197 VectorTripCount, "cmp.n", 3198 LoopMiddleBlock->getTerminator()); 3199 3200 // Here we use the same DebugLoc as the scalar loop latch branch instead 3201 // of the corresponding compare because they may have ended up with 3202 // different line numbers and we want to avoid awkward line stepping while 3203 // debugging. Eg. if the compare has got a line number inside the loop. 3204 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3205 } 3206 3207 BranchInst *BrInst = 3208 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3209 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3210 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3211 3212 // Get ready to start creating new instructions into the vectorized body. 3213 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3214 "Inconsistent vector loop preheader"); 3215 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3216 3217 Optional<MDNode *> VectorizedLoopID = 3218 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3219 LLVMLoopVectorizeFollowupVectorized}); 3220 if (VectorizedLoopID.hasValue()) { 3221 L->setLoopID(VectorizedLoopID.getValue()); 3222 3223 // Do not setAlreadyVectorized if loop attributes have been defined 3224 // explicitly. 
3225 return LoopVectorPreHeader; 3226 } 3227 3228 // Keep all loop hints from the original loop on the vector loop (we'll 3229 // replace the vectorizer-specific hints below). 3230 if (MDNode *LID = OrigLoop->getLoopID()) 3231 L->setLoopID(LID); 3232 3233 LoopVectorizeHints Hints(L, true, *ORE); 3234 Hints.setAlreadyVectorized(); 3235 3236 #ifdef EXPENSIVE_CHECKS 3237 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3238 LI->verify(*DT); 3239 #endif 3240 3241 return LoopVectorPreHeader; 3242 } 3243 3244 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3245 /* 3246 In this function we generate a new loop. The new loop will contain 3247 the vectorized instructions while the old loop will continue to run the 3248 scalar remainder. 3249 3250 [ ] <-- loop iteration number check. 3251 / | 3252 / v 3253 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3254 | / | 3255 | / v 3256 || [ ] <-- vector pre header. 3257 |/ | 3258 | v 3259 | [ ] \ 3260 | [ ]_| <-- vector loop. 3261 | | 3262 | v 3263 | -[ ] <--- middle-block. 3264 | / | 3265 | / v 3266 -|- >[ ] <--- new preheader. 3267 | | 3268 | v 3269 | [ ] \ 3270 | [ ]_| <-- old scalar loop to handle remainder. 3271 \ | 3272 \ v 3273 >[ ] <-- exit block. 3274 ... 3275 */ 3276 3277 // Get the metadata of the original loop before it gets modified. 3278 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3279 3280 // Create an empty vector loop, and prepare basic blocks for the runtime 3281 // checks. 3282 Loop *Lp = createVectorLoopSkeleton(""); 3283 3284 // Now, compare the new count to zero. If it is zero skip the vector loop and 3285 // jump to the scalar loop. This check also covers the case where the 3286 // backedge-taken count is uint##_max: adding one to it will overflow leading 3287 // to an incorrect trip count of zero. In this (rare) case we will also jump 3288 // to the scalar loop. 3289 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3290 3291 // Generate the code to check any assumptions that we've made for SCEV 3292 // expressions. 3293 emitSCEVChecks(Lp, LoopScalarPreHeader); 3294 3295 // Generate the code that checks in runtime if arrays overlap. We put the 3296 // checks into a separate block to make the more common case of few elements 3297 // faster. 3298 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3299 3300 // Some loops have a single integer induction variable, while other loops 3301 // don't. One example is c++ iterators that often have multiple pointer 3302 // induction variables. In the code below we also support a case where we 3303 // don't have a single induction variable. 3304 // 3305 // We try to obtain an induction variable from the original loop as hard 3306 // as possible. However if we don't find one that: 3307 // - is an integer 3308 // - counts from zero, stepping by one 3309 // - is the size of the widest induction variable type 3310 // then we create a new one. 3311 OldInduction = Legal->getPrimaryInduction(); 3312 Type *IdxTy = Legal->getWidestInductionType(); 3313 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3314 // The loop step is equal to the vectorization factor (num of SIMD elements) 3315 // times the unroll factor (num of SIMD instructions). 
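// For example, with VF = 4 and UF = 2 the canonical induction created below looks
// roughly like (illustrative names):
//   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
//   %index.next = add i64 %index, 8
// and the vector loop exits once %index.next equals the vector trip count.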
3316 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3317 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 3318 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3319 Induction = 3320 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3321 getDebugLocFromInstOrOperands(OldInduction)); 3322 3323 // Emit phis for the new starting index of the scalar loop. 3324 createInductionResumeValues(Lp, CountRoundDown); 3325 3326 return completeLoopSkeleton(Lp, OrigLoopID); 3327 } 3328 3329 // Fix up external users of the induction variable. At this point, we are 3330 // in LCSSA form, with all external PHIs that use the IV having one input value, 3331 // coming from the remainder loop. We need those PHIs to also have a correct 3332 // value for the IV when arriving directly from the middle block. 3333 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3334 const InductionDescriptor &II, 3335 Value *CountRoundDown, Value *EndValue, 3336 BasicBlock *MiddleBlock) { 3337 // There are two kinds of external IV usages - those that use the value 3338 // computed in the last iteration (the PHI) and those that use the penultimate 3339 // value (the value that feeds into the phi from the loop latch). 3340 // We allow both, but they, obviously, have different values. 3341 3342 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3343 3344 DenseMap<Value *, Value *> MissingVals; 3345 3346 // An external user of the last iteration's value should see the value that 3347 // the remainder loop uses to initialize its own IV. 3348 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3349 for (User *U : PostInc->users()) { 3350 Instruction *UI = cast<Instruction>(U); 3351 if (!OrigLoop->contains(UI)) { 3352 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3353 MissingVals[UI] = EndValue; 3354 } 3355 } 3356 3357 // An external user of the penultimate value need to see EndValue - Step. 3358 // The simplest way to get this is to recompute it from the constituent SCEVs, 3359 // that is Start + (Step * (CRD - 1)). 3360 for (User *U : OrigPhi->users()) { 3361 auto *UI = cast<Instruction>(U); 3362 if (!OrigLoop->contains(UI)) { 3363 const DataLayout &DL = 3364 OrigLoop->getHeader()->getModule()->getDataLayout(); 3365 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3366 3367 IRBuilder<> B(MiddleBlock->getTerminator()); 3368 Value *CountMinusOne = B.CreateSub( 3369 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3370 Value *CMO = 3371 !II.getStep()->getType()->isIntegerTy() 3372 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3373 II.getStep()->getType()) 3374 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3375 CMO->setName("cast.cmo"); 3376 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3377 Escape->setName("ind.escape"); 3378 MissingVals[UI] = Escape; 3379 } 3380 } 3381 3382 for (auto &I : MissingVals) { 3383 PHINode *PHI = cast<PHINode>(I.first); 3384 // One corner case we have to handle is two IVs "chasing" each-other, 3385 // that is %IV2 = phi [...], [ %IV1, %latch ] 3386 // In this case, if IV1 has an external use, we need to avoid adding both 3387 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3388 // don't already have an incoming value for the middle block. 
3389 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3390 PHI->addIncoming(I.second, MiddleBlock); 3391 } 3392 } 3393 3394 namespace { 3395 3396 struct CSEDenseMapInfo { 3397 static bool canHandle(const Instruction *I) { 3398 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3399 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3400 } 3401 3402 static inline Instruction *getEmptyKey() { 3403 return DenseMapInfo<Instruction *>::getEmptyKey(); 3404 } 3405 3406 static inline Instruction *getTombstoneKey() { 3407 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3408 } 3409 3410 static unsigned getHashValue(const Instruction *I) { 3411 assert(canHandle(I) && "Unknown instruction!"); 3412 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3413 I->value_op_end())); 3414 } 3415 3416 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3417 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3418 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3419 return LHS == RHS; 3420 return LHS->isIdenticalTo(RHS); 3421 } 3422 }; 3423 3424 } // end anonymous namespace 3425 3426 ///Perform cse of induction variable instructions. 3427 static void cse(BasicBlock *BB) { 3428 // Perform simple cse. 3429 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3430 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3431 Instruction *In = &*I++; 3432 3433 if (!CSEDenseMapInfo::canHandle(In)) 3434 continue; 3435 3436 // Check if we can replace this instruction with any of the 3437 // visited instructions. 3438 if (Instruction *V = CSEMap.lookup(In)) { 3439 In->replaceAllUsesWith(V); 3440 In->eraseFromParent(); 3441 continue; 3442 } 3443 3444 CSEMap[In] = In; 3445 } 3446 } 3447 3448 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3449 ElementCount VF, 3450 bool &NeedToScalarize) { 3451 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3452 Function *F = CI->getCalledFunction(); 3453 Type *ScalarRetTy = CI->getType(); 3454 SmallVector<Type *, 4> Tys, ScalarTys; 3455 for (auto &ArgOp : CI->arg_operands()) 3456 ScalarTys.push_back(ArgOp->getType()); 3457 3458 // Estimate cost of scalarized vector call. The source operands are assumed 3459 // to be vectors, so we need to extract individual elements from there, 3460 // execute VF scalar calls, and then gather the result into the vector return 3461 // value. 3462 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3463 TTI::TCK_RecipThroughput); 3464 if (VF.isScalar()) 3465 return ScalarCallCost; 3466 3467 // Compute corresponding vector type for return value and arguments. 3468 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3469 for (Type *ScalarTy : ScalarTys) 3470 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3471 3472 // Compute costs of unpacking argument values for the scalar calls and 3473 // packing the return values to a vector. 3474 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3475 3476 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3477 3478 // If we can't emit a vector call for this function, then the currently found 3479 // cost is the cost we need to return. 
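// For example (illustrative numbers): with VF = 4, a scalar call cost of 10 and a
// scalarization overhead of 6 give Cost = 4 * 10 + 6 = 46. If the target provides a
// usable vector variant whose call cost is below 46, that cost is returned instead
// and NeedToScalarize is cleared.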
3480 NeedToScalarize = true; 3481 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3482 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3483 3484 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3485 return Cost; 3486 3487 // If the corresponding vector cost is cheaper, return its cost. 3488 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3489 TTI::TCK_RecipThroughput); 3490 if (VectorCallCost < Cost) { 3491 NeedToScalarize = false; 3492 return VectorCallCost; 3493 } 3494 return Cost; 3495 } 3496 3497 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3498 ElementCount VF) { 3499 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3500 assert(ID && "Expected intrinsic call!"); 3501 3502 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3503 return TTI.getIntrinsicInstrCost(CostAttrs, 3504 TargetTransformInfo::TCK_RecipThroughput); 3505 } 3506 3507 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3508 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3509 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3510 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3511 } 3512 3513 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3514 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3515 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3516 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3517 } 3518 3519 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3520 // For every instruction `I` in MinBWs, truncate the operands, create a 3521 // truncated version of `I` and reextend its result. InstCombine runs 3522 // later and will remove any ext/trunc pairs. 3523 SmallPtrSet<Value *, 4> Erased; 3524 for (const auto &KV : Cost->getMinimalBitwidths()) { 3525 // If the value wasn't vectorized, we must maintain the original scalar 3526 // type. The absence of the value from VectorLoopValueMap indicates that it 3527 // wasn't vectorized. 3528 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3529 continue; 3530 for (unsigned Part = 0; Part < UF; ++Part) { 3531 Value *I = getOrCreateVectorValue(KV.first, Part); 3532 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3533 continue; 3534 Type *OriginalTy = I->getType(); 3535 Type *ScalarTruncatedTy = 3536 IntegerType::get(OriginalTy->getContext(), KV.second); 3537 auto *TruncatedTy = FixedVectorType::get( 3538 ScalarTruncatedTy, 3539 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3540 if (TruncatedTy == OriginalTy) 3541 continue; 3542 3543 IRBuilder<> B(cast<Instruction>(I)); 3544 auto ShrinkOperand = [&](Value *V) -> Value * { 3545 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3546 if (ZI->getSrcTy() == TruncatedTy) 3547 return ZI->getOperand(0); 3548 return B.CreateZExtOrTrunc(V, TruncatedTy); 3549 }; 3550 3551 // The actual instruction modification depends on the instruction type, 3552 // unfortunately. 3553 Value *NewI = nullptr; 3554 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3555 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3556 ShrinkOperand(BO->getOperand(1))); 3557 3558 // Any wrapping introduced by shrinking this operation shouldn't be 3559 // considered undefined behavior. So, we can't unconditionally copy 3560 // arithmetic wrapping flags to NewI. 
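// For instance, an 'add nuw' on i32 that is shrunk to i8 may now wrap (e.g.
// 250 + 10 becomes 4 in i8), so keeping the nuw/nsw flags could wrongly mark the
// narrowed result as poison.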
3561 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3562 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3563 NewI = 3564 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3565 ShrinkOperand(CI->getOperand(1))); 3566 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3567 NewI = B.CreateSelect(SI->getCondition(), 3568 ShrinkOperand(SI->getTrueValue()), 3569 ShrinkOperand(SI->getFalseValue())); 3570 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3571 switch (CI->getOpcode()) { 3572 default: 3573 llvm_unreachable("Unhandled cast!"); 3574 case Instruction::Trunc: 3575 NewI = ShrinkOperand(CI->getOperand(0)); 3576 break; 3577 case Instruction::SExt: 3578 NewI = B.CreateSExtOrTrunc( 3579 CI->getOperand(0), 3580 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3581 break; 3582 case Instruction::ZExt: 3583 NewI = B.CreateZExtOrTrunc( 3584 CI->getOperand(0), 3585 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3586 break; 3587 } 3588 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3589 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3590 ->getNumElements(); 3591 auto *O0 = B.CreateZExtOrTrunc( 3592 SI->getOperand(0), 3593 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3594 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3595 ->getNumElements(); 3596 auto *O1 = B.CreateZExtOrTrunc( 3597 SI->getOperand(1), 3598 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3599 3600 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3601 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3602 // Don't do anything with the operands, just extend the result. 3603 continue; 3604 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3605 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3606 ->getNumElements(); 3607 auto *O0 = B.CreateZExtOrTrunc( 3608 IE->getOperand(0), 3609 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3610 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3611 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3612 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3613 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3614 ->getNumElements(); 3615 auto *O0 = B.CreateZExtOrTrunc( 3616 EE->getOperand(0), 3617 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3618 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3619 } else { 3620 // If we don't know what to do, be conservative and don't do anything. 3621 continue; 3622 } 3623 3624 // Lastly, extend the result. 3625 NewI->takeName(cast<Instruction>(I)); 3626 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3627 I->replaceAllUsesWith(Res); 3628 cast<Instruction>(I)->eraseFromParent(); 3629 Erased.insert(I); 3630 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3631 } 3632 } 3633 3634 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3635 for (const auto &KV : Cost->getMinimalBitwidths()) { 3636 // If the value wasn't vectorized, we must maintain the original scalar 3637 // type. The absence of the value from VectorLoopValueMap indicates that it 3638 // wasn't vectorized. 
3639 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3640 continue; 3641 for (unsigned Part = 0; Part < UF; ++Part) { 3642 Value *I = getOrCreateVectorValue(KV.first, Part); 3643 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3644 if (Inst && Inst->use_empty()) { 3645 Value *NewI = Inst->getOperand(0); 3646 Inst->eraseFromParent(); 3647 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3648 } 3649 } 3650 } 3651 } 3652 3653 void InnerLoopVectorizer::fixVectorizedLoop() { 3654 // Insert truncates and extends for any truncated instructions as hints to 3655 // InstCombine. 3656 if (VF.isVector()) 3657 truncateToMinimalBitwidths(); 3658 3659 // Fix widened non-induction PHIs by setting up the PHI operands. 3660 if (OrigPHIsToFix.size()) { 3661 assert(EnableVPlanNativePath && 3662 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3663 fixNonInductionPHIs(); 3664 } 3665 3666 // At this point every instruction in the original loop is widened to a 3667 // vector form. Now we need to fix the recurrences in the loop. These PHI 3668 // nodes are currently empty because we did not want to introduce cycles. 3669 // This is the second stage of vectorizing recurrences. 3670 fixCrossIterationPHIs(); 3671 3672 // Forget the original basic block. 3673 PSE.getSE()->forgetLoop(OrigLoop); 3674 3675 // Fix-up external users of the induction variables. 3676 for (auto &Entry : Legal->getInductionVars()) 3677 fixupIVUsers(Entry.first, Entry.second, 3678 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3679 IVEndValues[Entry.first], LoopMiddleBlock); 3680 3681 fixLCSSAPHIs(); 3682 for (Instruction *PI : PredicatedInstructions) 3683 sinkScalarOperands(&*PI); 3684 3685 // Remove redundant induction instructions. 3686 cse(LoopVectorBody); 3687 3688 // Set/update profile weights for the vector and remainder loops as original 3689 // loop iterations are now distributed among them. Note that original loop 3690 // represented by LoopScalarBody becomes remainder loop after vectorization. 3691 // 3692 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3693 // end up getting slightly roughened result but that should be OK since 3694 // profile is not inherently precise anyway. Note also possible bypass of 3695 // vector code caused by legality checks is ignored, assigning all the weight 3696 // to the vector loop, optimistically. 3697 assert(!VF.isScalable() && 3698 "cannot use scalable ElementCount to determine unroll factor"); 3699 setProfileInfoAfterUnrolling( 3700 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 3701 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 3702 } 3703 3704 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3705 // In order to support recurrences we need to be able to vectorize Phi nodes. 3706 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3707 // stage #2: We now need to fix the recurrences by adding incoming edges to 3708 // the currently empty PHI nodes. At this point every instruction in the 3709 // original loop is widened to a vector form so we can use them to construct 3710 // the incoming edges. 3711 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3712 // Handle first-order recurrences and reductions that need to be fixed. 
3713 if (Legal->isFirstOrderRecurrence(&Phi)) 3714 fixFirstOrderRecurrence(&Phi); 3715 else if (Legal->isReductionVariable(&Phi)) 3716 fixReduction(&Phi); 3717 } 3718 } 3719 3720 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3721 // This is the second phase of vectorizing first-order recurrences. An 3722 // overview of the transformation is described below. Suppose we have the 3723 // following loop. 3724 // 3725 // for (int i = 0; i < n; ++i) 3726 // b[i] = a[i] - a[i - 1]; 3727 // 3728 // There is a first-order recurrence on "a". For this loop, the shorthand 3729 // scalar IR looks like: 3730 // 3731 // scalar.ph: 3732 // s_init = a[-1] 3733 // br scalar.body 3734 // 3735 // scalar.body: 3736 // i = phi [0, scalar.ph], [i+1, scalar.body] 3737 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3738 // s2 = a[i] 3739 // b[i] = s2 - s1 3740 // br cond, scalar.body, ... 3741 // 3742 // In this example, s1 is a recurrence because it's value depends on the 3743 // previous iteration. In the first phase of vectorization, we created a 3744 // temporary value for s1. We now complete the vectorization and produce the 3745 // shorthand vector IR shown below (for VF = 4, UF = 1). 3746 // 3747 // vector.ph: 3748 // v_init = vector(..., ..., ..., a[-1]) 3749 // br vector.body 3750 // 3751 // vector.body 3752 // i = phi [0, vector.ph], [i+4, vector.body] 3753 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3754 // v2 = a[i, i+1, i+2, i+3]; 3755 // v3 = vector(v1(3), v2(0, 1, 2)) 3756 // b[i, i+1, i+2, i+3] = v2 - v3 3757 // br cond, vector.body, middle.block 3758 // 3759 // middle.block: 3760 // x = v2(3) 3761 // br scalar.ph 3762 // 3763 // scalar.ph: 3764 // s_init = phi [x, middle.block], [a[-1], otherwise] 3765 // br scalar.body 3766 // 3767 // After execution completes the vector loop, we extract the next value of 3768 // the recurrence (x) to use as the initial value in the scalar loop. 3769 3770 // Get the original loop preheader and single loop latch. 3771 auto *Preheader = OrigLoop->getLoopPreheader(); 3772 auto *Latch = OrigLoop->getLoopLatch(); 3773 3774 // Get the initial and previous values of the scalar recurrence. 3775 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3776 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3777 3778 // Create a vector from the initial value. 3779 auto *VectorInit = ScalarInit; 3780 if (VF.isVector()) { 3781 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3782 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 3783 VectorInit = Builder.CreateInsertElement( 3784 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3785 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 3786 } 3787 3788 // We constructed a temporary phi node in the first phase of vectorization. 3789 // This phi node will eventually be deleted. 3790 Builder.SetInsertPoint( 3791 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3792 3793 // Create a phi node for the new recurrence. The current value will either be 3794 // the initial value inserted into a vector or loop-varying vector value. 3795 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3796 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3797 3798 // Get the vectorized previous value of the last part UF - 1. It appears last 3799 // among all unrolled iterations, due to the order of their construction. 
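// For example, with UF = 2 the unrolled copies of 'Previous' are created in the
// order part 0, part 1; part 1 holds the value of the last unrolled iteration and
// is therefore the one that must feed the recurrence phi across the backedge.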
3800 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3801 3802 // Find and set the insertion point after the previous value if it is an 3803 // instruction. 3804 BasicBlock::iterator InsertPt; 3805 // Note that the previous value may have been constant-folded so it is not 3806 // guaranteed to be an instruction in the vector loop. 3807 // FIXME: Loop invariant values do not form recurrences. We should deal with 3808 // them earlier. 3809 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3810 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3811 else { 3812 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3813 if (isa<PHINode>(PreviousLastPart)) 3814 // If the previous value is a phi node, we should insert after all the phi 3815 // nodes in the block containing the PHI to avoid breaking basic block 3816 // verification. Note that the basic block may be different to 3817 // LoopVectorBody, in case we predicate the loop. 3818 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3819 else 3820 InsertPt = ++PreviousInst->getIterator(); 3821 } 3822 Builder.SetInsertPoint(&*InsertPt); 3823 3824 // We will construct a vector for the recurrence by combining the values for 3825 // the current and previous iterations. This is the required shuffle mask. 3826 assert(!VF.isScalable()); 3827 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 3828 ShuffleMask[0] = VF.getKnownMinValue() - 1; 3829 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 3830 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 3831 3832 // The vector from which to take the initial value for the current iteration 3833 // (actual or unrolled). Initially, this is the vector phi node. 3834 Value *Incoming = VecPhi; 3835 3836 // Shuffle the current and previous vector and update the vector parts. 3837 for (unsigned Part = 0; Part < UF; ++Part) { 3838 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3839 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3840 auto *Shuffle = 3841 VF.isVector() 3842 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 3843 : Incoming; 3844 PhiPart->replaceAllUsesWith(Shuffle); 3845 cast<Instruction>(PhiPart)->eraseFromParent(); 3846 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3847 Incoming = PreviousPart; 3848 } 3849 3850 // Fix the latch value of the new recurrence in the vector loop. 3851 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3852 3853 // Extract the last vector element in the middle block. This will be the 3854 // initial value for the recurrence when jumping to the scalar loop. 3855 auto *ExtractForScalar = Incoming; 3856 if (VF.isVector()) { 3857 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3858 ExtractForScalar = Builder.CreateExtractElement( 3859 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 3860 "vector.recur.extract"); 3861 } 3862 // Extract the second last element in the middle block if the 3863 // Phi is used outside the loop. We need to extract the phi itself 3864 // and not the last element (the phi update in the current iteration). This 3865 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3866 // when the scalar loop is not run at all. 
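// For example, with VF = 4 the two extracts are roughly (illustrative element
// type T):
//   %vector.recur.extract         = extractelement <4 x T> %incoming, i32 3
//   %vector.recur.extract.for.phi = extractelement <4 x T> %incoming, i32 2
// Element 3 seeds the scalar loop's recurrence phi, while element 2 is the value
// an LCSSA phi outside the loop needs when the scalar loop is skipped.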
3867 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3868 if (VF.isVector()) 3869 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3870 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), 3871 "vector.recur.extract.for.phi"); 3872 // When the loop is unrolled without vectorizing, initialize 3873 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value of 3874 // `Incoming`. This is analogous to the vectorized case above: extracting the 3875 // second last element when VF > 1. 3876 else if (UF > 1) 3877 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3878 3879 // Fix the initial value of the original recurrence in the scalar loop. 3880 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3881 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3882 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3883 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3884 Start->addIncoming(Incoming, BB); 3885 } 3886 3887 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3888 Phi->setName("scalar.recur"); 3889 3890 // Finally, fix users of the recurrence outside the loop. The users will need 3891 // either the last value of the scalar recurrence or the last value of the 3892 // vector recurrence we extracted in the middle block. Since the loop is in 3893 // LCSSA form, we just need to find all the phi nodes for the original scalar 3894 // recurrence in the exit block, and then add an edge for the middle block. 3895 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3896 if (LCSSAPhi.getIncomingValue(0) == Phi) { 3897 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3898 } 3899 } 3900 } 3901 3902 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 3903 Constant *Zero = Builder.getInt32(0); 3904 3905 // Get its reduction variable descriptor. 3906 assert(Legal->isReductionVariable(Phi) && 3907 "Unable to find the reduction variable"); 3908 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 3909 3910 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3911 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3912 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3913 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 3914 RdxDesc.getMinMaxRecurrenceKind(); 3915 setDebugLocFromInst(Builder, ReductionStartValue); 3916 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 3917 3918 // We need to generate a reduction vector from the incoming scalar. 3919 // To do so, we need to generate the 'identity' vector and override 3920 // one of the elements with the incoming scalar reduction. We need 3921 // to do it in the vector-loop preheader. 3922 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3923 3924 // This is the vector-clone of the value that leaves the loop. 3925 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 3926 3927 // Find the reduction identity variable. Zero for addition, or, xor, 3928 // one for multiplication, -1 for And. 3929 Value *Identity; 3930 Value *VectorStart; 3931 if (RK == RecurrenceDescriptor::RK_IntegerMinMax || 3932 RK == RecurrenceDescriptor::RK_FloatMinMax) { 3933 // MinMax reductions have the start value as their identity.
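// For example (illustrative), an integer add reduction starting at %s with VF = 4
// uses Identity = <0, 0, 0, 0> and VectorStart = <%s, 0, 0, 0>, whereas a min/max
// reduction simply splats %s: folding extra copies of %s into a min/max that
// already includes %s cannot change the result.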
3934 if (VF == 1 || IsInLoopReductionPhi) { 3935 VectorStart = Identity = ReductionStartValue; 3936 } else { 3937 VectorStart = Identity = 3938 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3939 } 3940 } else { 3941 // Handle other reduction kinds: 3942 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3943 RK, VecTy->getScalarType()); 3944 if (VF == 1 || IsInLoopReductionPhi) { 3945 Identity = Iden; 3946 // This vector is the Identity vector where the first element is the 3947 // incoming scalar reduction. 3948 VectorStart = ReductionStartValue; 3949 } else { 3950 Identity = ConstantVector::getSplat(VF, Iden); 3951 3952 // This vector is the Identity vector where the first element is the 3953 // incoming scalar reduction. 3954 VectorStart = 3955 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3956 } 3957 } 3958 3959 // Wrap flags are in general invalid after vectorization, clear them. 3960 clearReductionWrapFlags(RdxDesc); 3961 3962 // Fix the vector-loop phi. 3963 3964 // Reductions do not have to start at zero. They can start with 3965 // any loop invariant values. 3966 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3967 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3968 3969 for (unsigned Part = 0; Part < UF; ++Part) { 3970 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3971 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3972 // Make sure to add the reduction start value only to the 3973 // first unroll part. 3974 Value *StartVal = (Part == 0) ? VectorStart : Identity; 3975 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3976 cast<PHINode>(VecRdxPhi) 3977 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3978 } 3979 3980 // Before each round, move the insertion point right between 3981 // the PHIs and the values we are going to write. 3982 // This allows us to write both PHINodes and the extractelement 3983 // instructions. 3984 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3985 3986 setDebugLocFromInst(Builder, LoopExitInst); 3987 3988 // If tail is folded by masking, the vector value to leave the loop should be 3989 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3990 // instead of the former. 3991 if (Cost->foldTailByMasking()) { 3992 for (unsigned Part = 0; Part < UF; ++Part) { 3993 Value *VecLoopExitInst = 3994 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3995 Value *Sel = nullptr; 3996 for (User *U : VecLoopExitInst->users()) { 3997 if (isa<SelectInst>(U)) { 3998 assert(!Sel && "Reduction exit feeding two selects"); 3999 Sel = U; 4000 } else 4001 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4002 } 4003 assert(Sel && "Reduction exit feeds no select"); 4004 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4005 4006 // If the target can create a predicated operator for the reduction at no 4007 // extra cost in the loop (for example a predicated vadd), it can be 4008 // cheaper for the select to remain in the loop than be sunk out of it, 4009 // and so use the select value for the phi instead of the old 4010 // LoopExitValue. 
4011 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4012 if (PreferPredicatedReductionSelect || 4013 TTI->preferPredicatedReductionSelect( 4014 RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()), 4015 Phi->getType(), TargetTransformInfo::ReductionFlags())) { 4016 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4017 VecRdxPhi->setIncomingValueForBlock( 4018 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4019 } 4020 } 4021 } 4022 4023 // If the vector reduction can be performed in a smaller type, we truncate 4024 // then extend the loop exit value to enable InstCombine to evaluate the 4025 // entire expression in the smaller type. 4026 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4027 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4028 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4029 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4030 Builder.SetInsertPoint( 4031 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4032 VectorParts RdxParts(UF); 4033 for (unsigned Part = 0; Part < UF; ++Part) { 4034 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4035 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4036 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4037 : Builder.CreateZExt(Trunc, VecTy); 4038 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4039 UI != RdxParts[Part]->user_end();) 4040 if (*UI != Trunc) { 4041 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4042 RdxParts[Part] = Extnd; 4043 } else { 4044 ++UI; 4045 } 4046 } 4047 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4048 for (unsigned Part = 0; Part < UF; ++Part) { 4049 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4050 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4051 } 4052 } 4053 4054 // Reduce all of the unrolled parts into a single vector. 4055 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4056 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 4057 4058 // The middle block terminator has already been assigned a DebugLoc here (the 4059 // OrigLoop's single latch terminator). We want the whole middle block to 4060 // appear to execute on this line because: (a) it is all compiler generated, 4061 // (b) these instructions are always executed after evaluating the latch 4062 // conditional branch, and (c) other passes may add new predecessors which 4063 // terminate on this line. This is the easiest way to ensure we don't 4064 // accidentally cause an extra step back into the loop while debugging. 4065 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4066 for (unsigned Part = 1; Part < UF; ++Part) { 4067 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4068 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4069 // Floating point operations had to be 'fast' to enable the reduction. 4070 ReducedPartRdx = addFastMathFlag( 4071 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4072 ReducedPartRdx, "bin.rdx"), 4073 RdxDesc.getFastMathFlags()); 4074 else 4075 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 4076 RdxPart); 4077 } 4078 4079 // Create the reduction after the loop. Note that inloop reductions create the 4080 // target reduction in the loop using a Reduction recipe. 
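// For example, with UF = 2 the loop above folds the two partial results into a
// single vector %bin.rdx; for VF > 1, out-of-loop reductions then reduce %bin.rdx
// horizontally to a single scalar below, before control reaches the merge phi in
// the scalar preheader.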
4081 if (VF.isVector() && !IsInLoopReductionPhi) { 4082 bool NoNaN = Legal->hasFunNoNaNAttr(); 4083 ReducedPartRdx = 4084 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 4085 // If the reduction can be performed in a smaller type, we need to extend 4086 // the reduction to the wider type before we branch to the original loop. 4087 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4088 ReducedPartRdx = 4089 RdxDesc.isSigned() 4090 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4091 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4092 } 4093 4094 // Create a phi node that merges control-flow from the backedge-taken check 4095 // block and the middle block. 4096 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4097 LoopScalarPreHeader->getTerminator()); 4098 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4099 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4100 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4101 4102 // Now, we need to fix the users of the reduction variable 4103 // inside and outside of the scalar remainder loop. 4104 // We know that the loop is in LCSSA form. We need to update the 4105 // PHI nodes in the exit blocks. 4106 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4107 // All PHINodes need to have a single entry edge, or two if 4108 // we already fixed them. 4109 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4110 4111 // We found a reduction value exit-PHI. Update it with the 4112 // incoming bypass edge. 4113 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4114 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4115 } // end of the LCSSA phi scan. 4116 4117 // Fix the scalar loop reduction variable with the incoming reduction sum 4118 // from the vector body and from the backedge value. 4119 int IncomingEdgeBlockIdx = 4120 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4121 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4122 // Pick the other block. 4123 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4124 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4125 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4126 } 4127 4128 void InnerLoopVectorizer::clearReductionWrapFlags( 4129 RecurrenceDescriptor &RdxDesc) { 4130 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 4131 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 4132 RK != RecurrenceDescriptor::RK_IntegerMult) 4133 return; 4134 4135 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4136 assert(LoopExitInstr && "null loop exit instruction"); 4137 SmallVector<Instruction *, 8> Worklist; 4138 SmallPtrSet<Instruction *, 8> Visited; 4139 Worklist.push_back(LoopExitInstr); 4140 Visited.insert(LoopExitInstr); 4141 4142 while (!Worklist.empty()) { 4143 Instruction *Cur = Worklist.pop_back_val(); 4144 if (isa<OverflowingBinaryOperator>(Cur)) 4145 for (unsigned Part = 0; Part < UF; ++Part) { 4146 Value *V = getOrCreateVectorValue(Cur, Part); 4147 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4148 } 4149 4150 for (User *U : Cur->users()) { 4151 Instruction *UI = cast<Instruction>(U); 4152 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4153 Visited.insert(UI).second) 4154 Worklist.push_back(UI); 4155 } 4156 } 4157 } 4158 4159 void InnerLoopVectorizer::fixLCSSAPHIs() { 4160 assert(!VF.isScalable() && "the code below assumes fixed width vectors"); 4161 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4162 if (LCSSAPhi.getNumIncomingValues() == 1) { 4163 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4164 // Non-instruction incoming values will have only one value. 4165 unsigned LastLane = 0; 4166 if (isa<Instruction>(IncomingValue)) 4167 LastLane = Cost->isUniformAfterVectorization( 4168 cast<Instruction>(IncomingValue), VF) 4169 ? 0 4170 : VF.getKnownMinValue() - 1; 4171 // Can be a loop invariant incoming value or the last scalar value to be 4172 // extracted from the vectorized loop. 4173 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4174 Value *lastIncomingValue = 4175 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4176 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4177 } 4178 } 4179 } 4180 4181 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4182 // The basic block and loop containing the predicated instruction. 4183 auto *PredBB = PredInst->getParent(); 4184 auto *VectorLoop = LI->getLoopFor(PredBB); 4185 4186 // Initialize a worklist with the operands of the predicated instruction. 4187 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4188 4189 // Holds instructions that we need to analyze again. An instruction may be 4190 // reanalyzed if we don't yet know if we can sink it or not. 4191 SmallVector<Instruction *, 8> InstsToReanalyze; 4192 4193 // Returns true if a given use occurs in the predicated block. Phi nodes use 4194 // their operands in their corresponding predecessor blocks. 4195 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4196 auto *I = cast<Instruction>(U.getUser()); 4197 BasicBlock *BB = I->getParent(); 4198 if (auto *Phi = dyn_cast<PHINode>(I)) 4199 BB = Phi->getIncomingBlock( 4200 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4201 return BB == PredBB; 4202 }; 4203 4204 // Iteratively sink the scalarized operands of the predicated instruction 4205 // into the block we created for it. When an instruction is sunk, it's 4206 // operands are then added to the worklist. 
The algorithm ends after one pass 4207 // through the worklist doesn't sink a single instruction. 4208 bool Changed; 4209 do { 4210 // Add the instructions that need to be reanalyzed to the worklist, and 4211 // reset the changed indicator. 4212 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4213 InstsToReanalyze.clear(); 4214 Changed = false; 4215 4216 while (!Worklist.empty()) { 4217 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4218 4219 // We can't sink an instruction if it is a phi node, is already in the 4220 // predicated block, is not in the loop, or may have side effects. 4221 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4222 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4223 continue; 4224 4225 // It's legal to sink the instruction if all its uses occur in the 4226 // predicated block. Otherwise, there's nothing to do yet, and we may 4227 // need to reanalyze the instruction. 4228 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4229 InstsToReanalyze.push_back(I); 4230 continue; 4231 } 4232 4233 // Move the instruction to the beginning of the predicated block, and add 4234 // it's operands to the worklist. 4235 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4236 Worklist.insert(I->op_begin(), I->op_end()); 4237 4238 // The sinking may have enabled other instructions to be sunk, so we will 4239 // need to iterate. 4240 Changed = true; 4241 } 4242 } while (Changed); 4243 } 4244 4245 void InnerLoopVectorizer::fixNonInductionPHIs() { 4246 for (PHINode *OrigPhi : OrigPHIsToFix) { 4247 PHINode *NewPhi = 4248 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4249 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4250 4251 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4252 predecessors(OrigPhi->getParent())); 4253 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4254 predecessors(NewPhi->getParent())); 4255 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4256 "Scalar and Vector BB should have the same number of predecessors"); 4257 4258 // The insertion point in Builder may be invalidated by the time we get 4259 // here. Force the Builder insertion point to something valid so that we do 4260 // not run into issues during insertion point restore in 4261 // getOrCreateVectorValue calls below. 4262 Builder.SetInsertPoint(NewPhi); 4263 4264 // The predecessor order is preserved and we can rely on mapping between 4265 // scalar and vector block predecessors. 4266 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4267 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4268 4269 // When looking up the new scalar/vector values to fix up, use incoming 4270 // values from original phi. 4271 Value *ScIncV = 4272 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4273 4274 // Scalar incoming value may need a broadcast 4275 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4276 NewPhi->addIncoming(NewIncV, NewPredBB); 4277 } 4278 } 4279 } 4280 4281 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, 4282 unsigned UF, ElementCount VF, 4283 bool IsPtrLoopInvariant, 4284 SmallBitVector &IsIndexLoopInvariant, 4285 VPTransformState &State) { 4286 // Construct a vector GEP by widening the operands of the scalar GEP as 4287 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4288 // results in a vector of pointers when at least one operand of the GEP 4289 // is vector-typed. 
Thus, to keep the representation compact, we only use 4290 // vector-typed operands for loop-varying values. 4291 4292 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4293 // If we are vectorizing, but the GEP has only loop-invariant operands, 4294 // the GEP we build (by only using vector-typed operands for 4295 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4296 // produce a vector of pointers, we need to either arbitrarily pick an 4297 // operand to broadcast, or broadcast a clone of the original GEP. 4298 // Here, we broadcast a clone of the original. 4299 // 4300 // TODO: If at some point we decide to scalarize instructions having 4301 // loop-invariant operands, this special case will no longer be 4302 // required. We would add the scalarization decision to 4303 // collectLoopScalars() and teach getVectorValue() to broadcast 4304 // the lane-zero scalar value. 4305 auto *Clone = Builder.Insert(GEP->clone()); 4306 for (unsigned Part = 0; Part < UF; ++Part) { 4307 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4308 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4309 addMetadata(EntryPart, GEP); 4310 } 4311 } else { 4312 // If the GEP has at least one loop-varying operand, we are sure to 4313 // produce a vector of pointers. But if we are only unrolling, we want 4314 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4315 // produce with the code below will be scalar (if VF == 1) or vector 4316 // (otherwise). Note that for the unroll-only case, we still maintain 4317 // values in the vector mapping with initVector, as we do for other 4318 // instructions. 4319 for (unsigned Part = 0; Part < UF; ++Part) { 4320 // The pointer operand of the new GEP. If it's loop-invariant, we 4321 // won't broadcast it. 4322 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4323 : State.get(Operands.getOperand(0), Part); 4324 4325 // Collect all the indices for the new GEP. If any index is 4326 // loop-invariant, we won't broadcast it. 4327 SmallVector<Value *, 4> Indices; 4328 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4329 VPValue *Operand = Operands.getOperand(I); 4330 if (IsIndexLoopInvariant[I - 1]) 4331 Indices.push_back(State.get(Operand, {0, 0})); 4332 else 4333 Indices.push_back(State.get(Operand, Part)); 4334 } 4335 4336 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4337 // but it should be a vector, otherwise. 4338 auto *NewGEP = 4339 GEP->isInBounds() 4340 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4341 Indices) 4342 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4343 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4344 "NewGEP is not a pointer vector"); 4345 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4346 addMetadata(NewGEP, GEP); 4347 } 4348 } 4349 } 4350 4351 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4352 ElementCount VF) { 4353 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4354 PHINode *P = cast<PHINode>(PN); 4355 if (EnableVPlanNativePath) { 4356 // Currently we enter here in the VPlan-native path for non-induction 4357 // PHIs where all control flow is uniform. We simply widen these PHIs. 4358 // Create a vector phi with no operands - the vector phi operands will be 4359 // set at the end of vector code generation. 4360 Type *VecTy = 4361 (VF.isScalar()) ? 
PN->getType() : VectorType::get(PN->getType(), VF); 4362 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4363 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4364 OrigPHIsToFix.push_back(P); 4365 4366 return; 4367 } 4368 4369 assert(PN->getParent() == OrigLoop->getHeader() && 4370 "Non-header phis should have been handled elsewhere"); 4371 4372 // In order to support recurrences we need to be able to vectorize Phi nodes. 4373 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4374 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4375 // this value when we vectorize all of the instructions that use the PHI. 4376 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4377 for (unsigned Part = 0; Part < UF; ++Part) { 4378 // This is phase one of vectorizing PHIs. 4379 bool ScalarPHI = 4380 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4381 Type *VecTy = 4382 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4383 Value *EntryPart = PHINode::Create( 4384 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4385 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4386 } 4387 return; 4388 } 4389 4390 setDebugLocFromInst(Builder, P); 4391 4392 // This PHINode must be an induction variable. 4393 // Make sure that we know about it. 4394 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4395 4396 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4397 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4398 4399 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4400 // which can be found from the original scalar operations. 4401 switch (II.getKind()) { 4402 case InductionDescriptor::IK_NoInduction: 4403 llvm_unreachable("Unknown induction"); 4404 case InductionDescriptor::IK_IntInduction: 4405 case InductionDescriptor::IK_FpInduction: 4406 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4407 case InductionDescriptor::IK_PtrInduction: { 4408 // Handle the pointer induction variable case. 4409 assert(P->getType()->isPointerTy() && "Unexpected type."); 4410 4411 if (Cost->isScalarAfterVectorization(P, VF)) { 4412 // This is the normalized GEP that starts counting at zero. 4413 Value *PtrInd = 4414 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4415 // Determine the number of scalars we need to generate for each unroll 4416 // iteration. If the instruction is uniform, we only need to generate the 4417 // first lane. Otherwise, we generate all VF values. 4418 unsigned Lanes = 4419 Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.getKnownMinValue(); 4420 for (unsigned Part = 0; Part < UF; ++Part) { 4421 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4422 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4423 Lane + Part * VF.getKnownMinValue()); 4424 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4425 Value *SclrGep = 4426 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4427 SclrGep->setName("next.gep"); 4428 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4429 } 4430 } 4431 return; 4432 } 4433 assert(isa<SCEVConstant>(II.getStep()) && 4434 "Induction step not a SCEV constant!"); 4435 Type *PhiType = II.getStep()->getType(); 4436 4437 // Build a pointer phi 4438 Value *ScalarStartValue = II.getStartValue(); 4439 Type *ScStValueType = ScalarStartValue->getType(); 4440 PHINode *NewPointerPhi = 4441 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4442 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4443 4444 // A pointer induction, performed by using a gep 4445 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4446 Instruction *InductionLoc = LoopLatch->getTerminator(); 4447 const SCEV *ScalarStep = II.getStep(); 4448 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4449 Value *ScalarStepValue = 4450 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4451 Value *InductionGEP = GetElementPtrInst::Create( 4452 ScStValueType->getPointerElementType(), NewPointerPhi, 4453 Builder.CreateMul( 4454 ScalarStepValue, 4455 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4456 "ptr.ind", InductionLoc); 4457 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4458 4459 // Create UF many actual address geps that use the pointer 4460 // phi as base and a vectorized version of the step value 4461 // (<step*0, ..., step*N>) as offset. 4462 for (unsigned Part = 0; Part < UF; ++Part) { 4463 SmallVector<Constant *, 8> Indices; 4464 // Create a vector of consecutive numbers from zero to VF. 4465 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4466 Indices.push_back( 4467 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4468 Constant *StartOffset = ConstantVector::get(Indices); 4469 4470 Value *GEP = Builder.CreateGEP( 4471 ScStValueType->getPointerElementType(), NewPointerPhi, 4472 Builder.CreateMul( 4473 StartOffset, 4474 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4475 "vector.gep")); 4476 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4477 } 4478 } 4479 } 4480 } 4481 4482 /// A helper function for checking whether an integer division-related 4483 /// instruction may divide by zero (in which case it must be predicated if 4484 /// executed conditionally in the scalar code). 4485 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4486 /// Non-zero divisors that are non compile-time constants will not be 4487 /// converted into multiplication, so we will still end up scalarizing 4488 /// the division, but can do so w/o predication. 
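/// For example, 'udiv i32 %x, 7' can never divide by zero and needs no
/// predication, whereas 'udiv i32 %x, %d' must stay predicated when executed
/// conditionally, since %d may be zero on the masked-off iterations.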
4489 static bool mayDivideByZero(Instruction &I) { 4490 assert((I.getOpcode() == Instruction::UDiv || 4491 I.getOpcode() == Instruction::SDiv || 4492 I.getOpcode() == Instruction::URem || 4493 I.getOpcode() == Instruction::SRem) && 4494 "Unexpected instruction"); 4495 Value *Divisor = I.getOperand(1); 4496 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4497 return !CInt || CInt->isZero(); 4498 } 4499 4500 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, 4501 VPTransformState &State) { 4502 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4503 switch (I.getOpcode()) { 4504 case Instruction::Call: 4505 case Instruction::Br: 4506 case Instruction::PHI: 4507 case Instruction::GetElementPtr: 4508 case Instruction::Select: 4509 llvm_unreachable("This instruction is handled by a different recipe."); 4510 case Instruction::UDiv: 4511 case Instruction::SDiv: 4512 case Instruction::SRem: 4513 case Instruction::URem: 4514 case Instruction::Add: 4515 case Instruction::FAdd: 4516 case Instruction::Sub: 4517 case Instruction::FSub: 4518 case Instruction::FNeg: 4519 case Instruction::Mul: 4520 case Instruction::FMul: 4521 case Instruction::FDiv: 4522 case Instruction::FRem: 4523 case Instruction::Shl: 4524 case Instruction::LShr: 4525 case Instruction::AShr: 4526 case Instruction::And: 4527 case Instruction::Or: 4528 case Instruction::Xor: { 4529 // Just widen unops and binops. 4530 setDebugLocFromInst(Builder, &I); 4531 4532 for (unsigned Part = 0; Part < UF; ++Part) { 4533 SmallVector<Value *, 2> Ops; 4534 for (VPValue *VPOp : User.operands()) 4535 Ops.push_back(State.get(VPOp, Part)); 4536 4537 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4538 4539 if (auto *VecOp = dyn_cast<Instruction>(V)) 4540 VecOp->copyIRFlags(&I); 4541 4542 // Use this vector value for all users of the original instruction. 4543 VectorLoopValueMap.setVectorValue(&I, Part, V); 4544 addMetadata(V, &I); 4545 } 4546 4547 break; 4548 } 4549 case Instruction::ICmp: 4550 case Instruction::FCmp: { 4551 // Widen compares. Generate vector compares. 4552 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4553 auto *Cmp = cast<CmpInst>(&I); 4554 setDebugLocFromInst(Builder, Cmp); 4555 for (unsigned Part = 0; Part < UF; ++Part) { 4556 Value *A = State.get(User.getOperand(0), Part); 4557 Value *B = State.get(User.getOperand(1), Part); 4558 Value *C = nullptr; 4559 if (FCmp) { 4560 // Propagate fast math flags. 4561 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4562 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4563 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4564 } else { 4565 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4566 } 4567 VectorLoopValueMap.setVectorValue(&I, Part, C); 4568 addMetadata(C, &I); 4569 } 4570 4571 break; 4572 } 4573 4574 case Instruction::ZExt: 4575 case Instruction::SExt: 4576 case Instruction::FPToUI: 4577 case Instruction::FPToSI: 4578 case Instruction::FPExt: 4579 case Instruction::PtrToInt: 4580 case Instruction::IntToPtr: 4581 case Instruction::SIToFP: 4582 case Instruction::UIToFP: 4583 case Instruction::Trunc: 4584 case Instruction::FPTrunc: 4585 case Instruction::BitCast: { 4586 auto *CI = cast<CastInst>(&I); 4587 setDebugLocFromInst(Builder, CI); 4588 4589 /// Vectorize casts. 4590 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4591 Type *DestTy = 4592 (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4593 4594 for (unsigned Part = 0; Part < UF; ++Part) { 4595 Value *A = State.get(User.getOperand(0), Part); 4596 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4597 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4598 addMetadata(Cast, &I); 4599 } 4600 break; 4601 } 4602 default: 4603 // This instruction is not vectorized by simple widening. 4604 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4605 llvm_unreachable("Unhandled instruction!"); 4606 } // end of switch. 4607 } 4608 4609 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4610 VPTransformState &State) { 4611 assert(!isa<DbgInfoIntrinsic>(I) && 4612 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4613 setDebugLocFromInst(Builder, &I); 4614 4615 Module *M = I.getParent()->getParent()->getParent(); 4616 auto *CI = cast<CallInst>(&I); 4617 4618 SmallVector<Type *, 4> Tys; 4619 for (Value *ArgOperand : CI->arg_operands()) 4620 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4621 4622 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4623 4624 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4625 // version of the instruction. 4626 // Is it beneficial to perform intrinsic call compared to lib call? 4627 bool NeedToScalarize = false; 4628 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4629 bool UseVectorIntrinsic = 4630 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4631 assert((UseVectorIntrinsic || !NeedToScalarize) && 4632 "Instruction should be scalarized elsewhere."); 4633 4634 for (unsigned Part = 0; Part < UF; ++Part) { 4635 SmallVector<Value *, 4> Args; 4636 for (auto &I : enumerate(ArgOperands.operands())) { 4637 // Some intrinsics have a scalar argument - don't replace it with a 4638 // vector. 4639 Value *Arg; 4640 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4641 Arg = State.get(I.value(), Part); 4642 else 4643 Arg = State.get(I.value(), {0, 0}); 4644 Args.push_back(Arg); 4645 } 4646 4647 Function *VectorF; 4648 if (UseVectorIntrinsic) { 4649 // Use vector version of the intrinsic. 4650 Type *TysForDecl[] = {CI->getType()}; 4651 if (VF.isVector()) { 4652 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4653 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4654 } 4655 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4656 assert(VectorF && "Can't retrieve vector intrinsic."); 4657 } else { 4658 // Use vector version of the function call. 4659 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4660 #ifndef NDEBUG 4661 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4662 "Can't create vector function."); 4663 #endif 4664 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4665 } 4666 SmallVector<OperandBundleDef, 1> OpBundles; 4667 CI->getOperandBundlesAsDefs(OpBundles); 4668 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4669 4670 if (isa<FPMathOperator>(V)) 4671 V->copyFastMathFlags(CI); 4672 4673 VectorLoopValueMap.setVectorValue(&I, Part, V); 4674 addMetadata(V, &I); 4675 } 4676 } 4677 4678 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4679 VPUser &Operands, 4680 bool InvariantCond, 4681 VPTransformState &State) { 4682 setDebugLocFromInst(Builder, &I); 4683 4684 // The condition can be loop invariant but still defined inside the 4685 // loop. 
This means that we can't just use the original 'cond' value. 4686 // We have to take the 'vectorized' value and pick the first lane. 4687 // Instcombine will make this a no-op. 4688 auto *InvarCond = 4689 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4690 4691 for (unsigned Part = 0; Part < UF; ++Part) { 4692 Value *Cond = 4693 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4694 Value *Op0 = State.get(Operands.getOperand(1), Part); 4695 Value *Op1 = State.get(Operands.getOperand(2), Part); 4696 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4697 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4698 addMetadata(Sel, &I); 4699 } 4700 } 4701 4702 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4703 // We should not collect Scalars more than once per VF. Right now, this 4704 // function is called from collectUniformsAndScalars(), which already does 4705 // this check. Collecting Scalars for VF=1 does not make any sense. 4706 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4707 "This function should not be visited twice for the same VF"); 4708 4709 SmallSetVector<Instruction *, 8> Worklist; 4710 4711 // These sets are used to seed the analysis with pointers used by memory 4712 // accesses that will remain scalar. 4713 SmallSetVector<Instruction *, 8> ScalarPtrs; 4714 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4715 auto *Latch = TheLoop->getLoopLatch(); 4716 4717 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4718 // The pointer operands of loads and stores will be scalar as long as the 4719 // memory access is not a gather or scatter operation. The value operand of a 4720 // store will remain scalar if the store is scalarized. 4721 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4722 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4723 assert(WideningDecision != CM_Unknown && 4724 "Widening decision should be ready at this moment"); 4725 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4726 if (Ptr == Store->getValueOperand()) 4727 return WideningDecision == CM_Scalarize; 4728 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4729 "Ptr is neither a value or pointer operand"); 4730 return WideningDecision != CM_GatherScatter; 4731 }; 4732 4733 // A helper that returns true if the given value is a bitcast or 4734 // getelementptr instruction contained in the loop. 4735 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4736 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4737 isa<GetElementPtrInst>(V)) && 4738 !TheLoop->isLoopInvariant(V); 4739 }; 4740 4741 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4742 if (!isa<PHINode>(Ptr) || 4743 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4744 return false; 4745 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4746 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4747 return false; 4748 return isScalarUse(MemAccess, Ptr); 4749 }; 4750 4751 // A helper that evaluates a memory access's use of a pointer. If the 4752 // pointer is actually the pointer induction of a loop, it is being 4753 // inserted into Worklist. If the use will be a scalar use, and the 4754 // pointer is only used by memory accesses, we place the pointer in 4755 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
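  // For example, a getelementptr that only feeds a consecutive (widened)
  // load or store ends up in ScalarPtrs, whereas the same getelementptr
  // lands in PossibleNonScalarPtrs if its access is a gather/scatter or if
  // it also has a non-memory user.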
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    if (isScalarPtrInduction(MemAccess, Ptr)) {
      Worklist.insert(cast<Instruction>(Ptr));
      Instruction *Update = cast<Instruction>(
          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
      Worklist.insert(Update);
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
                        << "\n");
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
                        << "\n");
      return;
    }
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
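  // E.g. if a scalar pointer is 'bitcast i64* %gep to i32*', then %gep also
  // becomes scalar once all of its users are either already in the worklist
  // or memory accesses that use it as a scalar pointer.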
4831 unsigned Idx = 0; 4832 while (Idx != Worklist.size()) { 4833 Instruction *Dst = Worklist[Idx++]; 4834 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4835 continue; 4836 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4837 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4838 auto *J = cast<Instruction>(U); 4839 return !TheLoop->contains(J) || Worklist.count(J) || 4840 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4841 isScalarUse(J, Src)); 4842 })) { 4843 Worklist.insert(Src); 4844 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4845 } 4846 } 4847 4848 // An induction variable will remain scalar if all users of the induction 4849 // variable and induction variable update remain scalar. 4850 for (auto &Induction : Legal->getInductionVars()) { 4851 auto *Ind = Induction.first; 4852 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4853 4854 // If tail-folding is applied, the primary induction variable will be used 4855 // to feed a vector compare. 4856 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4857 continue; 4858 4859 // Determine if all users of the induction variable are scalar after 4860 // vectorization. 4861 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4862 auto *I = cast<Instruction>(U); 4863 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4864 }); 4865 if (!ScalarInd) 4866 continue; 4867 4868 // Determine if all users of the induction variable update instruction are 4869 // scalar after vectorization. 4870 auto ScalarIndUpdate = 4871 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4872 auto *I = cast<Instruction>(U); 4873 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4874 }); 4875 if (!ScalarIndUpdate) 4876 continue; 4877 4878 // The induction variable and its update instruction will remain scalar. 4879 Worklist.insert(Ind); 4880 Worklist.insert(IndUpdate); 4881 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4882 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4883 << "\n"); 4884 } 4885 4886 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4887 } 4888 4889 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 4890 ElementCount VF) { 4891 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4892 if (!blockNeedsPredication(I->getParent())) 4893 return false; 4894 switch(I->getOpcode()) { 4895 default: 4896 break; 4897 case Instruction::Load: 4898 case Instruction::Store: { 4899 if (!Legal->isMaskRequired(I)) 4900 return false; 4901 auto *Ptr = getLoadStorePointerOperand(I); 4902 auto *Ty = getMemInstValueType(I); 4903 // We have already decided how to vectorize this instruction, get that 4904 // result. 4905 if (VF.isVector()) { 4906 InstWidening WideningDecision = getWideningDecision(I, VF); 4907 assert(WideningDecision != CM_Unknown && 4908 "Widening decision should be ready at this moment"); 4909 return WideningDecision == CM_Scalarize; 4910 } 4911 const Align Alignment = getLoadStoreAlignment(I); 4912 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4913 isLegalMaskedGather(Ty, Alignment)) 4914 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4915 isLegalMaskedScatter(Ty, Alignment)); 4916 } 4917 case Instruction::UDiv: 4918 case Instruction::SDiv: 4919 case Instruction::SRem: 4920 case Instruction::URem: 4921 return mayDivideByZero(*I); 4922 } 4923 return false; 4924 } 4925 4926 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4927 Instruction *I, ElementCount VF) { 4928 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4929 assert(getWideningDecision(I, VF) == CM_Unknown && 4930 "Decision should not be set yet."); 4931 auto *Group = getInterleavedAccessGroup(I); 4932 assert(Group && "Must have a group."); 4933 4934 // If the instruction's allocated size doesn't equal it's type size, it 4935 // requires padding and will be scalarized. 4936 auto &DL = I->getModule()->getDataLayout(); 4937 auto *ScalarTy = getMemInstValueType(I); 4938 if (hasIrregularType(ScalarTy, DL, VF)) 4939 return false; 4940 4941 // Check if masking is required. 4942 // A Group may need masking for one of two reasons: it resides in a block that 4943 // needs predication, or it was decided to use masking to deal with gaps. 4944 bool PredicatedAccessRequiresMasking = 4945 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4946 bool AccessWithGapsRequiresMasking = 4947 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4948 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4949 return true; 4950 4951 // If masked interleaving is required, we expect that the user/target had 4952 // enabled it, because otherwise it either wouldn't have been created or 4953 // it should have been invalidated by the CostModel. 4954 assert(useMaskedInterleavedAccesses(TTI) && 4955 "Masked interleave-groups for predicated accesses are not enabled."); 4956 4957 auto *Ty = getMemInstValueType(I); 4958 const Align Alignment = getLoadStoreAlignment(I); 4959 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4960 : TTI.isLegalMaskedStore(Ty, Alignment); 4961 } 4962 4963 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4964 Instruction *I, ElementCount VF) { 4965 // Get and ensure we have a valid memory instruction. 4966 LoadInst *LI = dyn_cast<LoadInst>(I); 4967 StoreInst *SI = dyn_cast<StoreInst>(I); 4968 assert((LI || SI) && "Invalid memory instruction"); 4969 4970 auto *Ptr = getLoadStorePointerOperand(I); 4971 4972 // In order to be widened, the pointer should be consecutive, first of all. 4973 if (!Legal->isConsecutivePtr(Ptr)) 4974 return false; 4975 4976 // If the instruction is a store located in a predicated block, it will be 4977 // scalarized. 4978 if (isScalarWithPredication(I)) 4979 return false; 4980 4981 // If the instruction's allocated size doesn't equal it's type size, it 4982 // requires padding and will be scalarized. 4983 auto &DL = I->getModule()->getDataLayout(); 4984 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4985 if (hasIrregularType(ScalarTy, DL, VF)) 4986 return false; 4987 4988 return true; 4989 } 4990 4991 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4992 // We should not collect Uniforms more than once per VF. Right now, 4993 // this function is called from collectUniformsAndScalars(), which 4994 // already does this check. Collecting Uniforms for VF=1 does not make any 4995 // sense. 
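  // "Uniform after vectorization" here means that only the value of lane
  // zero is ever demanded: e.g. the address feeding a consecutive wide load
  // is used as a single scalar base pointer and stays uniform, whereas the
  // address of a gather/scatter is needed for every lane and is not.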
4996 4997 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4998 "This function should not be visited twice for the same VF"); 4999 5000 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5001 // not analyze again. Uniforms.count(VF) will return 1. 5002 Uniforms[VF].clear(); 5003 5004 // We now know that the loop is vectorizable! 5005 // Collect instructions inside the loop that will remain uniform after 5006 // vectorization. 5007 5008 // Global values, params and instructions outside of current loop are out of 5009 // scope. 5010 auto isOutOfScope = [&](Value *V) -> bool { 5011 Instruction *I = dyn_cast<Instruction>(V); 5012 return (!I || !TheLoop->contains(I)); 5013 }; 5014 5015 SetVector<Instruction *> Worklist; 5016 BasicBlock *Latch = TheLoop->getLoopLatch(); 5017 5018 // Instructions that are scalar with predication must not be considered 5019 // uniform after vectorization, because that would create an erroneous 5020 // replicating region where only a single instance out of VF should be formed. 5021 // TODO: optimize such seldom cases if found important, see PR40816. 5022 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5023 if (isScalarWithPredication(I, VF)) { 5024 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5025 << *I << "\n"); 5026 return; 5027 } 5028 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5029 Worklist.insert(I); 5030 }; 5031 5032 // Start with the conditional branch. If the branch condition is an 5033 // instruction contained in the loop that is only used by the branch, it is 5034 // uniform. 5035 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5036 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5037 addToWorklistIfAllowed(Cmp); 5038 5039 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 5040 // are pointers that are treated like consecutive pointers during 5041 // vectorization. The pointer operands of interleaved accesses are an 5042 // example. 5043 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 5044 5045 // Holds pointer operands of instructions that are possibly non-uniform. 5046 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 5047 5048 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5049 InstWidening WideningDecision = getWideningDecision(I, VF); 5050 assert(WideningDecision != CM_Unknown && 5051 "Widening decision should be ready at this moment"); 5052 5053 return (WideningDecision == CM_Widen || 5054 WideningDecision == CM_Widen_Reverse || 5055 WideningDecision == CM_Interleave); 5056 }; 5057 // Iterate over the instructions in the loop, and collect all 5058 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 5059 // that a consecutive-like pointer operand will be scalarized, we collect it 5060 // in PossibleNonUniformPtrs instead. We use two sets here because a single 5061 // getelementptr instruction can be used by both vectorized and scalarized 5062 // memory instructions. For example, if a loop loads and stores from the same 5063 // location, but the store is conditional, the store will be scalarized, and 5064 // the getelementptr won't remain uniform. 5065 for (auto *BB : TheLoop->blocks()) 5066 for (auto &I : *BB) { 5067 // If there's no pointer operand, there's nothing to do. 
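      // (getLoadStorePointerOperand() returns the address of a load or store
      // and null for anything else, so non-memory instructions are skipped
      // here.)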
5068 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 5069 if (!Ptr) 5070 continue; 5071 5072 // True if all users of Ptr are memory accesses that have Ptr as their 5073 // pointer operand. 5074 auto UsersAreMemAccesses = 5075 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 5076 return getLoadStorePointerOperand(U) == Ptr; 5077 }); 5078 5079 // Ensure the memory instruction will not be scalarized or used by 5080 // gather/scatter, making its pointer operand non-uniform. If the pointer 5081 // operand is used by any instruction other than a memory access, we 5082 // conservatively assume the pointer operand may be non-uniform. 5083 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 5084 PossibleNonUniformPtrs.insert(Ptr); 5085 5086 // If the memory instruction will be vectorized and its pointer operand 5087 // is consecutive-like, or interleaving - the pointer operand should 5088 // remain uniform. 5089 else 5090 ConsecutiveLikePtrs.insert(Ptr); 5091 } 5092 5093 // Add to the Worklist all consecutive and consecutive-like pointers that 5094 // aren't also identified as possibly non-uniform. 5095 for (auto *V : ConsecutiveLikePtrs) 5096 if (!PossibleNonUniformPtrs.count(V)) 5097 addToWorklistIfAllowed(V); 5098 5099 // Expand Worklist in topological order: whenever a new instruction 5100 // is added , its users should be already inside Worklist. It ensures 5101 // a uniform instruction will only be used by uniform instructions. 5102 unsigned idx = 0; 5103 while (idx != Worklist.size()) { 5104 Instruction *I = Worklist[idx++]; 5105 5106 for (auto OV : I->operand_values()) { 5107 // isOutOfScope operands cannot be uniform instructions. 5108 if (isOutOfScope(OV)) 5109 continue; 5110 // First order recurrence Phi's should typically be considered 5111 // non-uniform. 5112 auto *OP = dyn_cast<PHINode>(OV); 5113 if (OP && Legal->isFirstOrderRecurrence(OP)) 5114 continue; 5115 // If all the users of the operand are uniform, then add the 5116 // operand into the uniform worklist. 5117 auto *OI = cast<Instruction>(OV); 5118 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5119 auto *J = cast<Instruction>(U); 5120 return Worklist.count(J) || 5121 (OI == getLoadStorePointerOperand(J) && 5122 isUniformDecision(J, VF)); 5123 })) 5124 addToWorklistIfAllowed(OI); 5125 } 5126 } 5127 5128 // Returns true if Ptr is the pointer operand of a memory access instruction 5129 // I, and I is known to not require scalarization. 5130 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5131 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5132 }; 5133 5134 // For an instruction to be added into Worklist above, all its users inside 5135 // the loop should also be in Worklist. However, this condition cannot be 5136 // true for phi nodes that form a cyclic dependence. We must process phi 5137 // nodes separately. An induction variable will remain uniform if all users 5138 // of the induction variable and induction variable update remain uniform. 5139 // The code below handles both pointer and non-pointer induction variables. 5140 for (auto &Induction : Legal->getInductionVars()) { 5141 auto *Ind = Induction.first; 5142 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5143 5144 // Determine if all users of the induction variable are uniform after 5145 // vectorization. 
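    // A user keeps the induction uniform if it is the update instruction
    // itself, lives outside the loop, is already known to be uniform, or is
    // a memory access that only consumes the induction as its pointer
    // operand.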
5146 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5147 auto *I = cast<Instruction>(U); 5148 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5149 isVectorizedMemAccessUse(I, Ind); 5150 }); 5151 if (!UniformInd) 5152 continue; 5153 5154 // Determine if all users of the induction variable update instruction are 5155 // uniform after vectorization. 5156 auto UniformIndUpdate = 5157 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5158 auto *I = cast<Instruction>(U); 5159 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5160 isVectorizedMemAccessUse(I, IndUpdate); 5161 }); 5162 if (!UniformIndUpdate) 5163 continue; 5164 5165 // The induction variable and its update instruction will remain uniform. 5166 addToWorklistIfAllowed(Ind); 5167 addToWorklistIfAllowed(IndUpdate); 5168 } 5169 5170 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5171 } 5172 5173 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5174 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5175 5176 if (Legal->getRuntimePointerChecking()->Need) { 5177 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5178 "runtime pointer checks needed. Enable vectorization of this " 5179 "loop with '#pragma clang loop vectorize(enable)' when " 5180 "compiling with -Os/-Oz", 5181 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5182 return true; 5183 } 5184 5185 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5186 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5187 "runtime SCEV checks needed. Enable vectorization of this " 5188 "loop with '#pragma clang loop vectorize(enable)' when " 5189 "compiling with -Os/-Oz", 5190 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5191 return true; 5192 } 5193 5194 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5195 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5196 reportVectorizationFailure("Runtime stride check for small trip count", 5197 "runtime stride == 1 checks needed. Enable vectorization of " 5198 "this loop without such check by compiling with -Os/-Oz", 5199 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5200 return true; 5201 } 5202 5203 return false; 5204 } 5205 5206 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, 5207 unsigned UserIC) { 5208 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5209 // TODO: It may by useful to do since it's still likely to be dynamically 5210 // uniform if the target can skip. 5211 reportVectorizationFailure( 5212 "Not inserting runtime ptr check for divergent target", 5213 "runtime pointer checks needed. Not enabled for divergent target", 5214 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5215 return None; 5216 } 5217 5218 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5219 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5220 if (TC == 1) { 5221 reportVectorizationFailure("Single iteration (non) loop", 5222 "loop trip count is one, irrelevant for vectorization", 5223 "SingleIterationLoop", ORE, TheLoop); 5224 return None; 5225 } 5226 5227 switch (ScalarEpilogueStatus) { 5228 case CM_ScalarEpilogueAllowed: 5229 return UserVF ? 
                  UserVF : computeFeasibleMaxVF(TC);
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;
    break;
  }

  // Now try the tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
  assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
  unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
  if (TC > 0 && TC % MaxVFtimesIC == 0) {
    // Accept MaxVF if we do not have a tail.
    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
    return MaxVF;
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxVF;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fall back to a vectorization with a scalar epilogue.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) {
      LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
      return None;
    }
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxVF;
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return None;
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time.
" 5307 "Enable vectorization of this loop with '#pragma clang loop " 5308 "vectorize(enable)' when compiling with -Os/-Oz", 5309 "NoTailLoopWithOptForSize", ORE, TheLoop); 5310 return None; 5311 } 5312 5313 unsigned 5314 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5315 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5316 unsigned SmallestType, WidestType; 5317 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5318 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5319 5320 // Get the maximum safe dependence distance in bits computed by LAA. 5321 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5322 // the memory accesses that is most restrictive (involved in the smallest 5323 // dependence distance). 5324 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5325 5326 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5327 5328 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5329 // Note that both WidestRegister and WidestType may not be a powers of 2. 5330 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5331 5332 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5333 << " / " << WidestType << " bits.\n"); 5334 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5335 << WidestRegister << " bits.\n"); 5336 5337 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5338 " into one vector!"); 5339 if (MaxVectorSize == 0) { 5340 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5341 MaxVectorSize = 1; 5342 return MaxVectorSize; 5343 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5344 isPowerOf2_32(ConstTripCount)) { 5345 // We need to clamp the VF to be the ConstTripCount. There is no point in 5346 // choosing a higher viable VF as done in the loop below. 5347 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5348 << ConstTripCount << "\n"); 5349 MaxVectorSize = ConstTripCount; 5350 return MaxVectorSize; 5351 } 5352 5353 unsigned MaxVF = MaxVectorSize; 5354 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5355 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5356 // Collect all viable vectorization factors larger than the default MaxVF 5357 // (i.e. MaxVectorSize). 5358 SmallVector<ElementCount, 8> VFs; 5359 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5360 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5361 VFs.push_back(ElementCount::getFixed(VS)); 5362 5363 // For each VF calculate its register usage. 5364 auto RUs = calculateRegisterUsage(VFs); 5365 5366 // Select the largest VF which doesn't require more registers than existing 5367 // ones. 
5368 for (int i = RUs.size() - 1; i >= 0; --i) { 5369 bool Selected = true; 5370 for (auto& pair : RUs[i].MaxLocalUsers) { 5371 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5372 if (pair.second > TargetNumRegisters) 5373 Selected = false; 5374 } 5375 if (Selected) { 5376 MaxVF = VFs[i].getKnownMinValue(); 5377 break; 5378 } 5379 } 5380 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5381 if (MaxVF < MinVF) { 5382 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5383 << ") with target's minimum: " << MinVF << '\n'); 5384 MaxVF = MinVF; 5385 } 5386 } 5387 } 5388 return MaxVF; 5389 } 5390 5391 VectorizationFactor 5392 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5393 float Cost = expectedCost(ElementCount::getFixed(1)).first; 5394 const float ScalarCost = Cost; 5395 unsigned Width = 1; 5396 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5397 5398 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5399 if (ForceVectorization && MaxVF > 1) { 5400 // Ignore scalar width, because the user explicitly wants vectorization. 5401 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5402 // evaluation. 5403 Cost = std::numeric_limits<float>::max(); 5404 } 5405 5406 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5407 // Notice that the vector loop needs to be executed less times, so 5408 // we need to divide the cost of the vector loops by the width of 5409 // the vector elements. 5410 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5411 float VectorCost = C.first / (float)i; 5412 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5413 << " costs: " << (int)VectorCost << ".\n"); 5414 if (!C.second && !ForceVectorization) { 5415 LLVM_DEBUG( 5416 dbgs() << "LV: Not considering vector loop of width " << i 5417 << " because it will not generate any vector instructions.\n"); 5418 continue; 5419 } 5420 if (VectorCost < Cost) { 5421 Cost = VectorCost; 5422 Width = i; 5423 } 5424 } 5425 5426 if (!EnableCondStoresVectorization && NumPredStores) { 5427 reportVectorizationFailure("There are conditional stores.", 5428 "store that is conditionally executed prevents vectorization", 5429 "ConditionalStore", ORE, TheLoop); 5430 Width = 1; 5431 Cost = ScalarCost; 5432 } 5433 5434 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5435 << "LV: Vectorization seems to be not beneficial, " 5436 << "but was forced by a user.\n"); 5437 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5438 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5439 (unsigned)(Width * Cost)}; 5440 return Factor; 5441 } 5442 5443 std::pair<unsigned, unsigned> 5444 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5445 unsigned MinWidth = -1U; 5446 unsigned MaxWidth = 8; 5447 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5448 5449 // For each block. 5450 for (BasicBlock *BB : TheLoop->blocks()) { 5451 // For each instruction in the loop. 5452 for (Instruction &I : BB->instructionsWithoutDebug()) { 5453 Type *T = I.getType(); 5454 5455 // Skip ignored values. 5456 if (ValuesToIgnore.count(&I)) 5457 continue; 5458 5459 // Only examine Loads, Stores and PHINodes. 5460 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5461 continue; 5462 5463 // Examine PHI nodes that are reduction variables. Update the type to 5464 // account for the recurrence type. 
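      // E.g. a sum that is kept in an i32 phi but whose recurrence only needs
      // i8 is counted as 8 bits here, so the narrow type can drive the
      // maximum-VF calculation.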
5465 if (auto *PN = dyn_cast<PHINode>(&I)) { 5466 if (!Legal->isReductionVariable(PN)) 5467 continue; 5468 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5469 T = RdxDesc.getRecurrenceType(); 5470 } 5471 5472 // Examine the stored values. 5473 if (auto *ST = dyn_cast<StoreInst>(&I)) 5474 T = ST->getValueOperand()->getType(); 5475 5476 // Ignore loaded pointer types and stored pointer types that are not 5477 // vectorizable. 5478 // 5479 // FIXME: The check here attempts to predict whether a load or store will 5480 // be vectorized. We only know this for certain after a VF has 5481 // been selected. Here, we assume that if an access can be 5482 // vectorized, it will be. We should also look at extending this 5483 // optimization to non-pointer types. 5484 // 5485 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5486 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5487 continue; 5488 5489 MinWidth = std::min(MinWidth, 5490 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5491 MaxWidth = std::max(MaxWidth, 5492 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5493 } 5494 } 5495 5496 return {MinWidth, MaxWidth}; 5497 } 5498 5499 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5500 unsigned LoopCost) { 5501 // -- The interleave heuristics -- 5502 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5503 // There are many micro-architectural considerations that we can't predict 5504 // at this level. For example, frontend pressure (on decode or fetch) due to 5505 // code size, or the number and capabilities of the execution ports. 5506 // 5507 // We use the following heuristics to select the interleave count: 5508 // 1. If the code has reductions, then we interleave to break the cross 5509 // iteration dependency. 5510 // 2. If the loop is really small, then we interleave to reduce the loop 5511 // overhead. 5512 // 3. We don't interleave if we think that we will spill registers to memory 5513 // due to the increased register pressure. 5514 5515 if (!isScalarEpilogueAllowed()) 5516 return 1; 5517 5518 // We used the distance for the interleave count. 5519 if (Legal->getMaxSafeDepDistBytes() != -1U) 5520 return 1; 5521 5522 // Do not interleave loops with a relatively small known or estimated trip 5523 // count. 5524 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5525 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5526 return 1; 5527 5528 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5529 // We divide by these constants so assume that we have at least one 5530 // instruction that uses at least one register. 5531 for (auto& pair : R.MaxLocalUsers) { 5532 pair.second = std::max(pair.second, 1U); 5533 } 5534 5535 // We calculate the interleave count using the following formula. 5536 // Subtract the number of loop invariants from the number of available 5537 // registers. These registers are used by all of the interleaved instances. 5538 // Next, divide the remaining registers by the number of registers that is 5539 // required by the loop, in order to estimate how many parallel instances 5540 // fit without causing spills. All of this is rounded down if necessary to be 5541 // a power of two. We want power of two interleave count to simplify any 5542 // addressing operations or alignment considerations. 
5543 // We also want power of two interleave counts to ensure that the induction 5544 // variable of the vector loop wraps to zero, when tail is folded by masking; 5545 // this currently happens when OptForSize, in which case IC is set to 1 above. 5546 unsigned IC = UINT_MAX; 5547 5548 for (auto& pair : R.MaxLocalUsers) { 5549 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5550 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5551 << " registers of " 5552 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5553 if (VF == 1) { 5554 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5555 TargetNumRegisters = ForceTargetNumScalarRegs; 5556 } else { 5557 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5558 TargetNumRegisters = ForceTargetNumVectorRegs; 5559 } 5560 unsigned MaxLocalUsers = pair.second; 5561 unsigned LoopInvariantRegs = 0; 5562 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5563 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5564 5565 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5566 // Don't count the induction variable as interleaved. 5567 if (EnableIndVarRegisterHeur) { 5568 TmpIC = 5569 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5570 std::max(1U, (MaxLocalUsers - 1))); 5571 } 5572 5573 IC = std::min(IC, TmpIC); 5574 } 5575 5576 // Clamp the interleave ranges to reasonable counts. 5577 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5578 unsigned MaxInterleaveCount = 5579 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5580 5581 // Check if the user has overridden the max. 5582 if (VF == 1) { 5583 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5584 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5585 } else { 5586 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5587 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5588 } 5589 5590 // If trip count is known or estimated compile time constant, limit the 5591 // interleave count to be less than the trip count divided by VF. 5592 if (BestKnownTC) { 5593 MaxInterleaveCount = 5594 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5595 } 5596 5597 // If we did not calculate the cost for VF (because the user selected the VF) 5598 // then we calculate the cost of VF here. 5599 if (LoopCost == 0) 5600 LoopCost = expectedCost(VF).first; 5601 5602 assert(LoopCost && "Non-zero loop cost expected"); 5603 5604 // Clamp the calculated IC to be between the 1 and the max interleave count 5605 // that the target and trip count allows. 5606 if (IC > MaxInterleaveCount) 5607 IC = MaxInterleaveCount; 5608 else if (IC < 1) 5609 IC = 1; 5610 5611 // Interleave if we vectorized this loop and there is a reduction that could 5612 // benefit from interleaving. 5613 if (VF.isVector() && !Legal->getReductionVars().empty()) { 5614 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5615 return IC; 5616 } 5617 5618 // Note that if we've already vectorized the loop we will have done the 5619 // runtime check and so interleaving won't require further checks. 5620 bool InterleavingRequiresRuntimePointerCheck = 5621 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5622 5623 // We want to interleave small loops in order to reduce the loop overhead and 5624 // potentially expose ILP opportunities. 
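  // Illustrative sketch only (hypothetical costs): if the register-pressure
  // bound above produced IC = 8, the loop body costs 4 and the small-loop
  // threshold is 20, the branch-overhead heuristic below picks
  //   SmallIC = min(8, PowerOf2Floor(20 / 4)) = 4,
  // i.e. interleave by 4 so the loop overhead is amortized over roughly a
  // threshold's worth of work; the store/load-port variant instead divides
  // IC by the number of stores (or loads).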
5625 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5626 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5627 // We assume that the cost overhead is 1 and we use the cost model 5628 // to estimate the cost of the loop and interleave until the cost of the 5629 // loop overhead is about 5% of the cost of the loop. 5630 unsigned SmallIC = 5631 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5632 5633 // Interleave until store/load ports (estimated by max interleave count) are 5634 // saturated. 5635 unsigned NumStores = Legal->getNumStores(); 5636 unsigned NumLoads = Legal->getNumLoads(); 5637 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5638 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5639 5640 // If we have a scalar reduction (vector reductions are already dealt with 5641 // by this point), we can increase the critical path length if the loop 5642 // we're interleaving is inside another loop. Limit, by default to 2, so the 5643 // critical path only gets increased by one reduction operation. 5644 if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) { 5645 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5646 SmallIC = std::min(SmallIC, F); 5647 StoresIC = std::min(StoresIC, F); 5648 LoadsIC = std::min(LoadsIC, F); 5649 } 5650 5651 if (EnableLoadStoreRuntimeInterleave && 5652 std::max(StoresIC, LoadsIC) > SmallIC) { 5653 LLVM_DEBUG( 5654 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5655 return std::max(StoresIC, LoadsIC); 5656 } 5657 5658 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5659 return SmallIC; 5660 } 5661 5662 // Interleave if this is a large loop (small loops are already dealt with by 5663 // this point) that could benefit from interleaving. 5664 bool HasReductions = !Legal->getReductionVars().empty(); 5665 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5666 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5667 return IC; 5668 } 5669 5670 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5671 return 1; 5672 } 5673 5674 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5675 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5676 // This function calculates the register usage by measuring the highest number 5677 // of values that are alive at a single location. Obviously, this is a very 5678 // rough estimation. We scan the loop in a topological order in order and 5679 // assign a number to each instruction. We use RPO to ensure that defs are 5680 // met before their users. We assume that each instruction that has in-loop 5681 // users starts an interval. We record every time that an in-loop value is 5682 // used, so we have a list of the first and last occurrences of each 5683 // instruction. Next, we transpose this data structure into a multi map that 5684 // holds the list of intervals that *end* at a specific location. This multi 5685 // map allows us to perform a linear search. We scan the instructions linearly 5686 // and record each time that a new interval starts, by placing it in a set. 5687 // If we find this value in the multi-map then we remove it from the set. 5688 // The max register usage is the maximum size of the set. 5689 // We also search for instructions that are defined outside the loop, but are 5690 // used inside the loop. 
We need this number separately from the max-interval 5691 // usage number because when we unroll, loop-invariant values do not take 5692 // more register. 5693 LoopBlocksDFS DFS(TheLoop); 5694 DFS.perform(LI); 5695 5696 RegisterUsage RU; 5697 5698 // Each 'key' in the map opens a new interval. The values 5699 // of the map are the index of the 'last seen' usage of the 5700 // instruction that is the key. 5701 using IntervalMap = DenseMap<Instruction *, unsigned>; 5702 5703 // Maps instruction to its index. 5704 SmallVector<Instruction *, 64> IdxToInstr; 5705 // Marks the end of each interval. 5706 IntervalMap EndPoint; 5707 // Saves the list of instruction indices that are used in the loop. 5708 SmallPtrSet<Instruction *, 8> Ends; 5709 // Saves the list of values that are used in the loop but are 5710 // defined outside the loop, such as arguments and constants. 5711 SmallPtrSet<Value *, 8> LoopInvariants; 5712 5713 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5714 for (Instruction &I : BB->instructionsWithoutDebug()) { 5715 IdxToInstr.push_back(&I); 5716 5717 // Save the end location of each USE. 5718 for (Value *U : I.operands()) { 5719 auto *Instr = dyn_cast<Instruction>(U); 5720 5721 // Ignore non-instruction values such as arguments, constants, etc. 5722 if (!Instr) 5723 continue; 5724 5725 // If this instruction is outside the loop then record it and continue. 5726 if (!TheLoop->contains(Instr)) { 5727 LoopInvariants.insert(Instr); 5728 continue; 5729 } 5730 5731 // Overwrite previous end points. 5732 EndPoint[Instr] = IdxToInstr.size(); 5733 Ends.insert(Instr); 5734 } 5735 } 5736 } 5737 5738 // Saves the list of intervals that end with the index in 'key'. 5739 using InstrList = SmallVector<Instruction *, 2>; 5740 DenseMap<unsigned, InstrList> TransposeEnds; 5741 5742 // Transpose the EndPoints to a list of values that end at each index. 5743 for (auto &Interval : EndPoint) 5744 TransposeEnds[Interval.second].push_back(Interval.first); 5745 5746 SmallPtrSet<Instruction *, 8> OpenIntervals; 5747 5748 // Get the size of the widest register. 5749 unsigned MaxSafeDepDist = -1U; 5750 if (Legal->getMaxSafeDepDistBytes() != -1U) 5751 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5752 unsigned WidestRegister = 5753 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5754 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5755 5756 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5757 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5758 5759 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5760 5761 // A lambda that gets the register usage for the given type and VF. 5762 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) { 5763 if (Ty->isTokenTy()) 5764 return 0U; 5765 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5766 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5767 return std::max<unsigned>(1, VF.getKnownMinValue() * TypeSize / 5768 WidestRegister); 5769 }; 5770 5771 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5772 Instruction *I = IdxToInstr[i]; 5773 5774 // Remove all of the instructions that end at this location. 5775 InstrList &List = TransposeEnds[i]; 5776 for (Instruction *ToRemove : List) 5777 OpenIntervals.erase(ToRemove); 5778 5779 // Ignore instructions that are never used within the loop. 5780 if (!Ends.count(I)) 5781 continue; 5782 5783 // Skip ignored values. 
5784 if (ValuesToIgnore.count(I)) 5785 continue; 5786 5787 // For each VF find the maximum usage of registers. 5788 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5789 // Count the number of live intervals. 5790 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5791 5792 if (VFs[j].isScalar()) { 5793 for (auto Inst : OpenIntervals) { 5794 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5795 if (RegUsage.find(ClassID) == RegUsage.end()) 5796 RegUsage[ClassID] = 1; 5797 else 5798 RegUsage[ClassID] += 1; 5799 } 5800 } else { 5801 collectUniformsAndScalars(VFs[j]); 5802 for (auto Inst : OpenIntervals) { 5803 // Skip ignored values for VF > 1. 5804 if (VecValuesToIgnore.count(Inst)) 5805 continue; 5806 if (isScalarAfterVectorization(Inst, VFs[j])) { 5807 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5808 if (RegUsage.find(ClassID) == RegUsage.end()) 5809 RegUsage[ClassID] = 1; 5810 else 5811 RegUsage[ClassID] += 1; 5812 } else { 5813 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5814 if (RegUsage.find(ClassID) == RegUsage.end()) 5815 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5816 else 5817 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5818 } 5819 } 5820 } 5821 5822 for (auto& pair : RegUsage) { 5823 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5824 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5825 else 5826 MaxUsages[j][pair.first] = pair.second; 5827 } 5828 } 5829 5830 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5831 << OpenIntervals.size() << '\n'); 5832 5833 // Add the current instruction to the list of open intervals. 5834 OpenIntervals.insert(I); 5835 } 5836 5837 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5838 SmallMapVector<unsigned, unsigned, 4> Invariant; 5839 5840 for (auto Inst : LoopInvariants) { 5841 unsigned Usage = 5842 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5843 unsigned ClassID = 5844 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 5845 if (Invariant.find(ClassID) == Invariant.end()) 5846 Invariant[ClassID] = Usage; 5847 else 5848 Invariant[ClassID] += Usage; 5849 } 5850 5851 LLVM_DEBUG({ 5852 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5853 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5854 << " item\n"; 5855 for (const auto &pair : MaxUsages[i]) { 5856 dbgs() << "LV(REG): RegisterClass: " 5857 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5858 << " registers\n"; 5859 } 5860 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5861 << " item\n"; 5862 for (const auto &pair : Invariant) { 5863 dbgs() << "LV(REG): RegisterClass: " 5864 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5865 << " registers\n"; 5866 } 5867 }); 5868 5869 RU.LoopInvariantRegs = Invariant; 5870 RU.MaxLocalUsers = MaxUsages[i]; 5871 RUs[i] = RU; 5872 } 5873 5874 return RUs; 5875 } 5876 5877 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5878 // TODO: Cost model for emulated masked load/store is completely 5879 // broken. This hack guides the cost model to use an artificially 5880 // high enough value to practically disable vectorization with such 5881 // operations, except where previously deployed legality hack allowed 5882 // using very low cost values. This is to avoid regressions coming simply 5883 // from moving "masked load/store" check from legality to cost model. 
5884 // Masked Load/Gather emulation was previously never allowed. 5885 // Limited number of Masked Store/Scatter emulation was allowed. 5886 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5887 return isa<LoadInst>(I) || 5888 (isa<StoreInst>(I) && 5889 NumPredStores > NumberOfStoresToPredicate); 5890 } 5891 5892 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 5893 // If we aren't vectorizing the loop, or if we've already collected the 5894 // instructions to scalarize, there's nothing to do. Collection may already 5895 // have occurred if we have a user-selected VF and are now computing the 5896 // expected cost for interleaving. 5897 if (VF.isScalar() || VF.isZero() || 5898 InstsToScalarize.find(VF) != InstsToScalarize.end()) 5899 return; 5900 5901 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5902 // not profitable to scalarize any instructions, the presence of VF in the 5903 // map will indicate that we've analyzed it already. 5904 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5905 5906 // Find all the instructions that are scalar with predication in the loop and 5907 // determine if it would be better to not if-convert the blocks they are in. 5908 // If so, we also record the instructions to scalarize. 5909 for (BasicBlock *BB : TheLoop->blocks()) { 5910 if (!blockNeedsPredication(BB)) 5911 continue; 5912 for (Instruction &I : *BB) 5913 if (isScalarWithPredication(&I)) { 5914 ScalarCostsTy ScalarCosts; 5915 // Do not apply discount logic if hacked cost is needed 5916 // for emulated masked memrefs. 5917 if (!useEmulatedMaskMemRefHack(&I) && 5918 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5919 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5920 // Remember that BB will remain after vectorization. 5921 PredicatedBBsAfterVectorization.insert(BB); 5922 } 5923 } 5924 } 5925 5926 int LoopVectorizationCostModel::computePredInstDiscount( 5927 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5928 ElementCount VF) { 5929 assert(!isUniformAfterVectorization(PredInst, VF) && 5930 "Instruction marked uniform-after-vectorization will be predicated"); 5931 5932 // Initialize the discount to zero, meaning that the scalar version and the 5933 // vector version cost the same. 5934 int Discount = 0; 5935 5936 // Holds instructions to analyze. The instructions we visit are mapped in 5937 // ScalarCosts. Those instructions are the ones that would be scalarized if 5938 // we find that the scalar version costs less. 5939 SmallVector<Instruction *, 8> Worklist; 5940 5941 // Returns true if the given instruction can be scalarized. 5942 auto canBeScalarized = [&](Instruction *I) -> bool { 5943 // We only attempt to scalarize instructions forming a single-use chain 5944 // from the original predicated block that would otherwise be vectorized. 5945 // Although not strictly necessary, we give up on instructions we know will 5946 // already be scalar to avoid traversing chains that are unlikely to be 5947 // beneficial. 5948 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5949 isScalarAfterVectorization(I, VF)) 5950 return false; 5951 5952 // If the instruction is scalar with predication, it will be analyzed 5953 // separately. We ignore it within the context of PredInst. 5954 if (isScalarWithPredication(I)) 5955 return false; 5956 5957 // If any of the instruction's operands are uniform after vectorization, 5958 // the instruction cannot be scalarized. 
This prevents, for example, a 5959 // masked load from being scalarized. 5960 // 5961 // We assume we will only emit a value for lane zero of an instruction 5962 // marked uniform after vectorization, rather than VF identical values. 5963 // Thus, if we scalarize an instruction that uses a uniform, we would 5964 // create uses of values corresponding to the lanes we aren't emitting code 5965 // for. This behavior can be changed by allowing getScalarValue to clone 5966 // the lane zero values for uniforms rather than asserting. 5967 for (Use &U : I->operands()) 5968 if (auto *J = dyn_cast<Instruction>(U.get())) 5969 if (isUniformAfterVectorization(J, VF)) 5970 return false; 5971 5972 // Otherwise, we can scalarize the instruction. 5973 return true; 5974 }; 5975 5976 // Compute the expected cost discount from scalarizing the entire expression 5977 // feeding the predicated instruction. We currently only consider expressions 5978 // that are single-use instruction chains. 5979 Worklist.push_back(PredInst); 5980 while (!Worklist.empty()) { 5981 Instruction *I = Worklist.pop_back_val(); 5982 5983 // If we've already analyzed the instruction, there's nothing to do. 5984 if (ScalarCosts.find(I) != ScalarCosts.end()) 5985 continue; 5986 5987 // Compute the cost of the vector instruction. Note that this cost already 5988 // includes the scalarization overhead of the predicated instruction. 5989 unsigned VectorCost = getInstructionCost(I, VF).first; 5990 5991 // Compute the cost of the scalarized instruction. This cost is the cost of 5992 // the instruction as if it wasn't if-converted and instead remained in the 5993 // predicated block. We will scale this cost by block probability after 5994 // computing the scalarization overhead. 5995 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5996 unsigned ScalarCost = 5997 VF.getKnownMinValue() * 5998 getInstructionCost(I, ElementCount::getFixed(1)).first; 5999 6000 // Compute the scalarization overhead of needed insertelement instructions 6001 // and phi nodes. 6002 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6003 ScalarCost += TTI.getScalarizationOverhead( 6004 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6005 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6006 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6007 ScalarCost += 6008 VF.getKnownMinValue() * 6009 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6010 } 6011 6012 // Compute the scalarization overhead of needed extractelement 6013 // instructions. For each of the instruction's operands, if the operand can 6014 // be scalarized, add it to the worklist; otherwise, account for the 6015 // overhead. 6016 for (Use &U : I->operands()) 6017 if (auto *J = dyn_cast<Instruction>(U.get())) { 6018 assert(VectorType::isValidElementType(J->getType()) && 6019 "Instruction has non-scalar type"); 6020 if (canBeScalarized(J)) 6021 Worklist.push_back(J); 6022 else if (needsExtract(J, VF)) { 6023 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6024 ScalarCost += TTI.getScalarizationOverhead( 6025 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6026 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6027 } 6028 } 6029 6030 // Scale the total scalar cost by block probability. 6031 ScalarCost /= getReciprocalPredBlockProb(); 6032 6033 // Compute the discount. A non-negative discount means the vector version 6034 // of the instruction costs more, and scalarizing would be beneficial. 
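    // Illustrative sketch only (hypothetical costs): with VF = 4, a
    // predicated vector form costing 12 and a scalarized form costing
    // 4 * 2 = 8 before scaling, and assuming getReciprocalPredBlockProb()
    // returns 2 (a 50% chance the block executes), ScalarCost becomes 4 and
    // this instruction adds 12 - 4 = 8 to the discount, favoring
    // scalarization.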
6035 Discount += VectorCost - ScalarCost; 6036 ScalarCosts[I] = ScalarCost; 6037 } 6038 6039 return Discount; 6040 } 6041 6042 LoopVectorizationCostModel::VectorizationCostTy 6043 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6044 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6045 VectorizationCostTy Cost; 6046 6047 // For each block. 6048 for (BasicBlock *BB : TheLoop->blocks()) { 6049 VectorizationCostTy BlockCost; 6050 6051 // For each instruction in the old loop. 6052 for (Instruction &I : BB->instructionsWithoutDebug()) { 6053 // Skip ignored values. 6054 if (ValuesToIgnore.count(&I) || 6055 (VF.isVector() && VecValuesToIgnore.count(&I))) 6056 continue; 6057 6058 VectorizationCostTy C = getInstructionCost(&I, VF); 6059 6060 // Check if we should override the cost. 6061 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6062 C.first = ForceTargetInstructionCost; 6063 6064 BlockCost.first += C.first; 6065 BlockCost.second |= C.second; 6066 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6067 << " for VF " << VF << " For instruction: " << I 6068 << '\n'); 6069 } 6070 6071 // If we are vectorizing a predicated block, it will have been 6072 // if-converted. This means that the block's instructions (aside from 6073 // stores and instructions that may divide by zero) will now be 6074 // unconditionally executed. For the scalar case, we may not always execute 6075 // the predicated block. Thus, scale the block's cost by the probability of 6076 // executing it. 6077 if (VF.isScalar() && blockNeedsPredication(BB)) 6078 BlockCost.first /= getReciprocalPredBlockProb(); 6079 6080 Cost.first += BlockCost.first; 6081 Cost.second |= BlockCost.second; 6082 } 6083 6084 return Cost; 6085 } 6086 6087 /// Gets Address Access SCEV after verifying that the access pattern 6088 /// is loop invariant except the induction variable dependence. 6089 /// 6090 /// This SCEV can be sent to the Target in order to estimate the address 6091 /// calculation cost. 6092 static const SCEV *getAddressAccessSCEV( 6093 Value *Ptr, 6094 LoopVectorizationLegality *Legal, 6095 PredicatedScalarEvolution &PSE, 6096 const Loop *TheLoop) { 6097 6098 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6099 if (!Gep) 6100 return nullptr; 6101 6102 // We are looking for a gep with all loop invariant indices except for one 6103 // which should be an induction variable. 6104 auto SE = PSE.getSE(); 6105 unsigned NumOperands = Gep->getNumOperands(); 6106 for (unsigned i = 1; i < NumOperands; ++i) { 6107 Value *Opd = Gep->getOperand(i); 6108 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6109 !Legal->isInductionVariable(Opd)) 6110 return nullptr; 6111 } 6112 6113 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 
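  // Illustrative IR sketch (hypothetical names): a GEP such as
  //   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
  // with %base loop-invariant and %iv the induction variable survives the
  // check above, so its SCEV is handed to the target for address-cost
  // estimation; a GEP with an additional loop-varying, non-induction index
  // would have returned nullptr instead.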
6114 return PSE.getSCEV(Ptr); 6115 } 6116 6117 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6118 return Legal->hasStride(I->getOperand(0)) || 6119 Legal->hasStride(I->getOperand(1)); 6120 } 6121 6122 unsigned 6123 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6124 ElementCount VF) { 6125 assert(VF.isVector() && 6126 "Scalarization cost of instruction implies vectorization."); 6127 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6128 Type *ValTy = getMemInstValueType(I); 6129 auto SE = PSE.getSE(); 6130 6131 unsigned AS = getLoadStoreAddressSpace(I); 6132 Value *Ptr = getLoadStorePointerOperand(I); 6133 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6134 6135 // Figure out whether the access is strided and get the stride value 6136 // if it's known in compile time 6137 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6138 6139 // Get the cost of the scalar memory instruction and address computation. 6140 unsigned Cost = 6141 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6142 6143 // Don't pass *I here, since it is scalar but will actually be part of a 6144 // vectorized loop where the user of it is a vectorized instruction. 6145 const Align Alignment = getLoadStoreAlignment(I); 6146 Cost += VF.getKnownMinValue() * 6147 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6148 AS, TTI::TCK_RecipThroughput); 6149 6150 // Get the overhead of the extractelement and insertelement instructions 6151 // we might create due to scalarization. 6152 Cost += getScalarizationOverhead(I, VF); 6153 6154 // If we have a predicated store, it may not be executed for each vector 6155 // lane. Scale the cost by the probability of executing the predicated 6156 // block. 6157 if (isPredicatedInst(I)) { 6158 Cost /= getReciprocalPredBlockProb(); 6159 6160 if (useEmulatedMaskMemRefHack(I)) 6161 // Artificially setting to a high enough value to practically disable 6162 // vectorization with such operations. 
6163 Cost = 3000000; 6164 } 6165 6166 return Cost; 6167 } 6168 6169 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6170 ElementCount VF) { 6171 Type *ValTy = getMemInstValueType(I); 6172 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6173 Value *Ptr = getLoadStorePointerOperand(I); 6174 unsigned AS = getLoadStoreAddressSpace(I); 6175 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6176 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6177 6178 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6179 "Stride should be 1 or -1 for consecutive memory access"); 6180 const Align Alignment = getLoadStoreAlignment(I); 6181 unsigned Cost = 0; 6182 if (Legal->isMaskRequired(I)) 6183 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6184 CostKind); 6185 else 6186 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6187 CostKind, I); 6188 6189 bool Reverse = ConsecutiveStride < 0; 6190 if (Reverse) 6191 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6192 return Cost; 6193 } 6194 6195 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6196 ElementCount VF) { 6197 Type *ValTy = getMemInstValueType(I); 6198 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6199 const Align Alignment = getLoadStoreAlignment(I); 6200 unsigned AS = getLoadStoreAddressSpace(I); 6201 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6202 if (isa<LoadInst>(I)) { 6203 return TTI.getAddressComputationCost(ValTy) + 6204 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6205 CostKind) + 6206 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6207 } 6208 StoreInst *SI = cast<StoreInst>(I); 6209 6210 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6211 return TTI.getAddressComputationCost(ValTy) + 6212 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6213 CostKind) + 6214 (isLoopInvariantStoreValue 6215 ? 0 6216 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6217 VF.getKnownMinValue() - 1)); 6218 } 6219 6220 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6221 ElementCount VF) { 6222 Type *ValTy = getMemInstValueType(I); 6223 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6224 const Align Alignment = getLoadStoreAlignment(I); 6225 const Value *Ptr = getLoadStorePointerOperand(I); 6226 6227 return TTI.getAddressComputationCost(VectorTy) + 6228 TTI.getGatherScatterOpCost( 6229 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6230 TargetTransformInfo::TCK_RecipThroughput, I); 6231 } 6232 6233 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6234 ElementCount VF) { 6235 Type *ValTy = getMemInstValueType(I); 6236 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6237 unsigned AS = getLoadStoreAddressSpace(I); 6238 6239 auto Group = getInterleavedAccessGroup(I); 6240 assert(Group && "Fail to get an interleaved access group."); 6241 6242 unsigned InterleaveFactor = Group->getFactor(); 6243 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6244 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6245 6246 // Holds the indices of existing members in an interleaved load group. 6247 // An interleaved store group doesn't need this as it doesn't allow gaps. 
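  // Illustrative sketch (hypothetical group): for a load group of factor 3
  // where only members 0 and 2 exist (a gap at index 1), Indices becomes
  // {0, 2}, letting the target price the wide load plus shuffles for just
  // those two members.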
6248 SmallVector<unsigned, 4> Indices; 6249 if (isa<LoadInst>(I)) { 6250 for (unsigned i = 0; i < InterleaveFactor; i++) 6251 if (Group->getMember(i)) 6252 Indices.push_back(i); 6253 } 6254 6255 // Calculate the cost of the whole interleaved group. 6256 bool UseMaskForGaps = 6257 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6258 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6259 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6260 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6261 6262 if (Group->isReverse()) { 6263 // TODO: Add support for reversed masked interleaved access. 6264 assert(!Legal->isMaskRequired(I) && 6265 "Reverse masked interleaved access not supported."); 6266 Cost += Group->getNumMembers() * 6267 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6268 } 6269 return Cost; 6270 } 6271 6272 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6273 ElementCount VF) { 6274 // Calculate scalar cost only. Vectorization cost should be ready at this 6275 // moment. 6276 if (VF.isScalar()) { 6277 Type *ValTy = getMemInstValueType(I); 6278 const Align Alignment = getLoadStoreAlignment(I); 6279 unsigned AS = getLoadStoreAddressSpace(I); 6280 6281 return TTI.getAddressComputationCost(ValTy) + 6282 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6283 TTI::TCK_RecipThroughput, I); 6284 } 6285 return getWideningCost(I, VF); 6286 } 6287 6288 LoopVectorizationCostModel::VectorizationCostTy 6289 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6290 ElementCount VF) { 6291 assert(!VF.isScalable() && 6292 "the cost model is not yet implemented for scalable vectorization"); 6293 // If we know that this instruction will remain uniform, check the cost of 6294 // the scalar version. 6295 if (isUniformAfterVectorization(I, VF)) 6296 VF = ElementCount::getFixed(1); 6297 6298 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6299 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6300 6301 // Forced scalars do not have any scalarization overhead. 6302 auto ForcedScalar = ForcedScalars.find(VF); 6303 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6304 auto InstSet = ForcedScalar->second; 6305 if (InstSet.count(I)) 6306 return VectorizationCostTy( 6307 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6308 VF.getKnownMinValue()), 6309 false); 6310 } 6311 6312 Type *VectorTy; 6313 unsigned C = getInstructionCost(I, VF, VectorTy); 6314 6315 bool TypeNotScalarized = 6316 VF.isVector() && VectorTy->isVectorTy() && 6317 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6318 return VectorizationCostTy(C, TypeNotScalarized); 6319 } 6320 6321 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6322 ElementCount VF) { 6323 6324 assert(!VF.isScalable() && 6325 "cannot compute scalarization overhead for scalable vectorization"); 6326 if (VF.isScalar()) 6327 return 0; 6328 6329 unsigned Cost = 0; 6330 Type *RetTy = ToVectorTy(I->getType(), VF); 6331 if (!RetTy->isVoidTy() && 6332 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6333 Cost += TTI.getScalarizationOverhead( 6334 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 6335 true, false); 6336 6337 // Some targets keep addresses scalar. 6338 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6339 return Cost; 6340 6341 // Some targets support efficient element stores. 
6342 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6343 return Cost; 6344 6345 // Collect operands to consider. 6346 CallInst *CI = dyn_cast<CallInst>(I); 6347 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6348 6349 // Skip operands that do not require extraction/scalarization and do not incur 6350 // any overhead. 6351 return Cost + TTI.getOperandsScalarizationOverhead( 6352 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 6353 } 6354 6355 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6356 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6357 if (VF.isScalar()) 6358 return; 6359 NumPredStores = 0; 6360 for (BasicBlock *BB : TheLoop->blocks()) { 6361 // For each instruction in the old loop. 6362 for (Instruction &I : *BB) { 6363 Value *Ptr = getLoadStorePointerOperand(&I); 6364 if (!Ptr) 6365 continue; 6366 6367 // TODO: We should generate better code and update the cost model for 6368 // predicated uniform stores. Today they are treated as any other 6369 // predicated store (see added test cases in 6370 // invariant-store-vectorization.ll). 6371 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6372 NumPredStores++; 6373 6374 if (Legal->isUniform(Ptr) && 6375 // Conditional loads and stores should be scalarized and predicated. 6376 // isScalarWithPredication cannot be used here since masked 6377 // gather/scatters are not considered scalar with predication. 6378 !Legal->blockNeedsPredication(I.getParent())) { 6379 // TODO: Avoid replicating loads and stores instead of 6380 // relying on instcombine to remove them. 6381 // Load: Scalar load + broadcast 6382 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6383 unsigned Cost = getUniformMemOpCost(&I, VF); 6384 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6385 continue; 6386 } 6387 6388 // We assume that widening is the best solution when possible. 6389 if (memoryInstructionCanBeWidened(&I, VF)) { 6390 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6391 int ConsecutiveStride = 6392 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6393 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6394 "Expected consecutive stride."); 6395 InstWidening Decision = 6396 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6397 setWideningDecision(&I, VF, Decision, Cost); 6398 continue; 6399 } 6400 6401 // Choose between Interleaving, Gather/Scatter or Scalarization. 6402 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6403 unsigned NumAccesses = 1; 6404 if (isAccessInterleaved(&I)) { 6405 auto Group = getInterleavedAccessGroup(&I); 6406 assert(Group && "Fail to get an interleaved access group."); 6407 6408 // Make one decision for the whole group. 6409 if (getWideningDecision(&I, VF) != CM_Unknown) 6410 continue; 6411 6412 NumAccesses = Group->getNumMembers(); 6413 if (interleavedAccessCanBeWidened(&I, VF)) 6414 InterleaveCost = getInterleaveGroupCost(&I, VF); 6415 } 6416 6417 unsigned GatherScatterCost = 6418 isLegalGatherOrScatter(&I) 6419 ? getGatherScatterCost(&I, VF) * NumAccesses 6420 : std::numeric_limits<unsigned>::max(); 6421 6422 unsigned ScalarizationCost = 6423 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6424 6425 // Choose better solution for the current VF, 6426 // write down this decision and use it during vectorization. 
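      // Illustrative sketch only (hypothetical costs): with
      // InterleaveCost = 6, GatherScatterCost = 10 and ScalarizationCost = 14
      // the group is recorded as CM_Interleave with cost 6; were interleaving
      // impossible (cost UINT_MAX), the same numbers would yield
      // CM_GatherScatter. On equal costs the conditions below prefer
      // interleaving over gather/scatter, while a gather/scatter cost that
      // merely ties scalarization falls through to CM_Scalarize.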
6427 unsigned Cost; 6428 InstWidening Decision; 6429 if (InterleaveCost <= GatherScatterCost && 6430 InterleaveCost < ScalarizationCost) { 6431 Decision = CM_Interleave; 6432 Cost = InterleaveCost; 6433 } else if (GatherScatterCost < ScalarizationCost) { 6434 Decision = CM_GatherScatter; 6435 Cost = GatherScatterCost; 6436 } else { 6437 Decision = CM_Scalarize; 6438 Cost = ScalarizationCost; 6439 } 6440 // If the instructions belongs to an interleave group, the whole group 6441 // receives the same decision. The whole group receives the cost, but 6442 // the cost will actually be assigned to one instruction. 6443 if (auto Group = getInterleavedAccessGroup(&I)) 6444 setWideningDecision(Group, VF, Decision, Cost); 6445 else 6446 setWideningDecision(&I, VF, Decision, Cost); 6447 } 6448 } 6449 6450 // Make sure that any load of address and any other address computation 6451 // remains scalar unless there is gather/scatter support. This avoids 6452 // inevitable extracts into address registers, and also has the benefit of 6453 // activating LSR more, since that pass can't optimize vectorized 6454 // addresses. 6455 if (TTI.prefersVectorizedAddressing()) 6456 return; 6457 6458 // Start with all scalar pointer uses. 6459 SmallPtrSet<Instruction *, 8> AddrDefs; 6460 for (BasicBlock *BB : TheLoop->blocks()) 6461 for (Instruction &I : *BB) { 6462 Instruction *PtrDef = 6463 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6464 if (PtrDef && TheLoop->contains(PtrDef) && 6465 getWideningDecision(&I, VF) != CM_GatherScatter) 6466 AddrDefs.insert(PtrDef); 6467 } 6468 6469 // Add all instructions used to generate the addresses. 6470 SmallVector<Instruction *, 4> Worklist; 6471 for (auto *I : AddrDefs) 6472 Worklist.push_back(I); 6473 while (!Worklist.empty()) { 6474 Instruction *I = Worklist.pop_back_val(); 6475 for (auto &Op : I->operands()) 6476 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6477 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6478 AddrDefs.insert(InstOp).second) 6479 Worklist.push_back(InstOp); 6480 } 6481 6482 for (auto *I : AddrDefs) { 6483 if (isa<LoadInst>(I)) { 6484 // Setting the desired widening decision should ideally be handled in 6485 // by cost functions, but since this involves the task of finding out 6486 // if the loaded register is involved in an address computation, it is 6487 // instead changed here when we know this is the case. 6488 InstWidening Decision = getWideningDecision(I, VF); 6489 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6490 // Scalarize a widened load of address. 6491 setWideningDecision( 6492 I, VF, CM_Scalarize, 6493 (VF.getKnownMinValue() * 6494 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6495 else if (auto Group = getInterleavedAccessGroup(I)) { 6496 // Scalarize an interleave group of address loads. 6497 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6498 if (Instruction *Member = Group->getMember(I)) 6499 setWideningDecision( 6500 Member, VF, CM_Scalarize, 6501 (VF.getKnownMinValue() * 6502 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6503 } 6504 } 6505 } else 6506 // Make sure I gets scalarized and a cost estimate without 6507 // scalarization overhead. 
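      // Illustrative sketch: an add that only feeds scalar address
      // computations ends up here; getInstructionCost() will later price a
      // forced-scalar instruction as VF scalar copies with no extract/insert
      // overhead.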
6508 ForcedScalars[VF].insert(I); 6509 } 6510 } 6511 6512 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6513 ElementCount VF, 6514 Type *&VectorTy) { 6515 Type *RetTy = I->getType(); 6516 if (canTruncateToMinimalBitwidth(I, VF)) 6517 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6518 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6519 auto SE = PSE.getSE(); 6520 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6521 6522 // TODO: We need to estimate the cost of intrinsic calls. 6523 switch (I->getOpcode()) { 6524 case Instruction::GetElementPtr: 6525 // We mark this instruction as zero-cost because the cost of GEPs in 6526 // vectorized code depends on whether the corresponding memory instruction 6527 // is scalarized or not. Therefore, we handle GEPs with the memory 6528 // instruction cost. 6529 return 0; 6530 case Instruction::Br: { 6531 // In cases of scalarized and predicated instructions, there will be VF 6532 // predicated blocks in the vectorized loop. Each branch around these 6533 // blocks requires also an extract of its vector compare i1 element. 6534 bool ScalarPredicatedBB = false; 6535 BranchInst *BI = cast<BranchInst>(I); 6536 if (VF.isVector() && BI->isConditional() && 6537 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6538 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6539 ScalarPredicatedBB = true; 6540 6541 if (ScalarPredicatedBB) { 6542 // Return cost for branches around scalarized and predicated blocks. 6543 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6544 auto *Vec_i1Ty = 6545 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6546 return (TTI.getScalarizationOverhead( 6547 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6548 false, true) + 6549 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 6550 VF.getKnownMinValue())); 6551 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6552 // The back-edge branch will remain, as will all scalar branches. 6553 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6554 else 6555 // This branch will be eliminated by if-conversion. 6556 return 0; 6557 // Note: We currently assume zero cost for an unconditional branch inside 6558 // a predicated block since it will become a fall-through, although we 6559 // may decide in the future to call TTI for all branches. 6560 } 6561 case Instruction::PHI: { 6562 auto *Phi = cast<PHINode>(I); 6563 6564 // First-order recurrences are replaced by vector shuffles inside the loop. 6565 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6566 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 6567 return TTI.getShuffleCost( 6568 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 6569 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 6570 6571 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6572 // converted into select instructions. We require N - 1 selects per phi 6573 // node, where N is the number of incoming values. 
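    // Illustrative sketch: a phi merging three if-converted values needs two
    // selects, e.g. (hypothetical IR, VF = 4)
    //   %m = select <4 x i1> %c1, <4 x i32> %a, <4 x i32> %b
    //   %r = select <4 x i1> %c2, <4 x i32> %m, <4 x i32> %d
    // so the cost below is (3 - 1) times the vector select cost.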
6574 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 6575 return (Phi->getNumIncomingValues() - 1) * 6576 TTI.getCmpSelInstrCost( 6577 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6578 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6579 CostKind); 6580 6581 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6582 } 6583 case Instruction::UDiv: 6584 case Instruction::SDiv: 6585 case Instruction::URem: 6586 case Instruction::SRem: 6587 // If we have a predicated instruction, it may not be executed for each 6588 // vector lane. Get the scalarization cost and scale this amount by the 6589 // probability of executing the predicated block. If the instruction is not 6590 // predicated, we fall through to the next case. 6591 if (VF.isVector() && isScalarWithPredication(I)) { 6592 unsigned Cost = 0; 6593 6594 // These instructions have a non-void type, so account for the phi nodes 6595 // that we will create. This cost is likely to be zero. The phi node 6596 // cost, if any, should be scaled by the block probability because it 6597 // models a copy at the end of each predicated block. 6598 Cost += VF.getKnownMinValue() * 6599 TTI.getCFInstrCost(Instruction::PHI, CostKind); 6600 6601 // The cost of the non-predicated instruction. 6602 Cost += VF.getKnownMinValue() * 6603 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6604 6605 // The cost of insertelement and extractelement instructions needed for 6606 // scalarization. 6607 Cost += getScalarizationOverhead(I, VF); 6608 6609 // Scale the cost by the probability of executing the predicated blocks. 6610 // This assumes the predicated block for each vector lane is equally 6611 // likely. 6612 return Cost / getReciprocalPredBlockProb(); 6613 } 6614 LLVM_FALLTHROUGH; 6615 case Instruction::Add: 6616 case Instruction::FAdd: 6617 case Instruction::Sub: 6618 case Instruction::FSub: 6619 case Instruction::Mul: 6620 case Instruction::FMul: 6621 case Instruction::FDiv: 6622 case Instruction::FRem: 6623 case Instruction::Shl: 6624 case Instruction::LShr: 6625 case Instruction::AShr: 6626 case Instruction::And: 6627 case Instruction::Or: 6628 case Instruction::Xor: { 6629 // Since we will replace the stride by 1 the multiplication should go away. 6630 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6631 return 0; 6632 // Certain instructions can be cheaper to vectorize if they have a constant 6633 // second vector operand. One example of this are shifts on x86. 6634 Value *Op2 = I->getOperand(1); 6635 TargetTransformInfo::OperandValueProperties Op2VP; 6636 TargetTransformInfo::OperandValueKind Op2VK = 6637 TTI.getOperandInfo(Op2, Op2VP); 6638 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6639 Op2VK = TargetTransformInfo::OK_UniformValue; 6640 6641 SmallVector<const Value *, 4> Operands(I->operand_values()); 6642 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 6643 return N * TTI.getArithmeticInstrCost( 6644 I->getOpcode(), VectorTy, CostKind, 6645 TargetTransformInfo::OK_AnyValue, 6646 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6647 } 6648 case Instruction::FNeg: { 6649 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 6650 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 6651 return N * TTI.getArithmeticInstrCost( 6652 I->getOpcode(), VectorTy, CostKind, 6653 TargetTransformInfo::OK_AnyValue, 6654 TargetTransformInfo::OK_AnyValue, 6655 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6656 I->getOperand(0), I); 6657 } 6658 case Instruction::Select: { 6659 SelectInst *SI = cast<SelectInst>(I); 6660 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6661 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6662 Type *CondTy = SI->getCondition()->getType(); 6663 if (!ScalarCond) { 6664 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 6665 CondTy = VectorType::get(CondTy, VF); 6666 } 6667 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 6668 CostKind, I); 6669 } 6670 case Instruction::ICmp: 6671 case Instruction::FCmp: { 6672 Type *ValTy = I->getOperand(0)->getType(); 6673 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6674 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6675 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6676 VectorTy = ToVectorTy(ValTy, VF); 6677 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, 6678 I); 6679 } 6680 case Instruction::Store: 6681 case Instruction::Load: { 6682 ElementCount Width = VF; 6683 if (Width.isVector()) { 6684 InstWidening Decision = getWideningDecision(I, Width); 6685 assert(Decision != CM_Unknown && 6686 "CM decision should be taken at this point"); 6687 if (Decision == CM_Scalarize) 6688 Width = ElementCount::getFixed(1); 6689 } 6690 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6691 return getMemoryInstructionCost(I, VF); 6692 } 6693 case Instruction::ZExt: 6694 case Instruction::SExt: 6695 case Instruction::FPToUI: 6696 case Instruction::FPToSI: 6697 case Instruction::FPExt: 6698 case Instruction::PtrToInt: 6699 case Instruction::IntToPtr: 6700 case Instruction::SIToFP: 6701 case Instruction::UIToFP: 6702 case Instruction::Trunc: 6703 case Instruction::FPTrunc: 6704 case Instruction::BitCast: { 6705 // Computes the CastContextHint from a Load/Store instruction. 6706 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 6707 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6708 "Expected a load or a store!"); 6709 6710 if (VF.isScalar() || !TheLoop->contains(I)) 6711 return TTI::CastContextHint::Normal; 6712 6713 switch (getWideningDecision(I, VF)) { 6714 case LoopVectorizationCostModel::CM_GatherScatter: 6715 return TTI::CastContextHint::GatherScatter; 6716 case LoopVectorizationCostModel::CM_Interleave: 6717 return TTI::CastContextHint::Interleave; 6718 case LoopVectorizationCostModel::CM_Scalarize: 6719 case LoopVectorizationCostModel::CM_Widen: 6720 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 6721 : TTI::CastContextHint::Normal; 6722 case LoopVectorizationCostModel::CM_Widen_Reverse: 6723 return TTI::CastContextHint::Reversed; 6724 case LoopVectorizationCostModel::CM_Unknown: 6725 llvm_unreachable("Instr did not go through cost modelling?"); 6726 } 6727 6728 llvm_unreachable("Unhandled case!"); 6729 }; 6730 6731 unsigned Opcode = I->getOpcode(); 6732 TTI::CastContextHint CCH = TTI::CastContextHint::None; 6733 // For Trunc, the context is the only user, which must be a StoreInst. 
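    // Illustrative IR sketch (hypothetical values): for
    //   %t = trunc i32 %x to i16
    //   store i16 %t, i16* %p
    // the store is the single user, so its widening decision (say CM_Widen
    // or CM_GatherScatter) supplies the CastContextHint used to cost the
    // trunc; likewise a zext/sext fed directly by a load takes its hint from
    // that load.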
6734 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 6735 if (I->hasOneUse()) 6736 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 6737 CCH = ComputeCCH(Store); 6738 } 6739 // For Z/Sext, the context is the operand, which must be a LoadInst. 6740 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 6741 Opcode == Instruction::FPExt) { 6742 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 6743 CCH = ComputeCCH(Load); 6744 } 6745 6746 // We optimize the truncation of induction variables having constant 6747 // integer steps. The cost of these truncations is the same as the scalar 6748 // operation. 6749 if (isOptimizableIVTruncate(I, VF)) { 6750 auto *Trunc = cast<TruncInst>(I); 6751 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6752 Trunc->getSrcTy(), CCH, CostKind, Trunc); 6753 } 6754 6755 Type *SrcScalarTy = I->getOperand(0)->getType(); 6756 Type *SrcVecTy = 6757 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6758 if (canTruncateToMinimalBitwidth(I, VF)) { 6759 // This cast is going to be shrunk. This may remove the cast or it might 6760 // turn it into slightly different cast. For example, if MinBW == 16, 6761 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6762 // 6763 // Calculate the modified src and dest types. 6764 Type *MinVecTy = VectorTy; 6765 if (Opcode == Instruction::Trunc) { 6766 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6767 VectorTy = 6768 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6769 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 6770 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6771 VectorTy = 6772 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6773 } 6774 } 6775 6776 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 6777 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 6778 return N * 6779 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 6780 } 6781 case Instruction::Call: { 6782 bool NeedToScalarize; 6783 CallInst *CI = cast<CallInst>(I); 6784 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6785 if (getVectorIntrinsicIDForCall(CI, TLI)) 6786 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6787 return CallCost; 6788 } 6789 default: 6790 // The cost of executing VF copies of the scalar instruction. This opcode 6791 // is unknown. Assume that it is the same as 'mul'. 6792 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 6793 Instruction::Mul, VectorTy, CostKind) + 6794 getScalarizationOverhead(I, VF); 6795 } // end of switch. 
6796 } 6797 6798 char LoopVectorize::ID = 0; 6799 6800 static const char lv_name[] = "Loop Vectorization"; 6801 6802 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6803 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6804 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6805 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6806 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6807 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6808 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6809 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6810 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6811 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6812 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6813 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6814 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6815 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6816 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6817 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6818 6819 namespace llvm { 6820 6821 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6822 6823 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6824 bool VectorizeOnlyWhenForced) { 6825 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6826 } 6827 6828 } // end namespace llvm 6829 6830 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6831 // Check if the pointer operand of a load or store instruction is 6832 // consecutive. 6833 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6834 return Legal->isConsecutivePtr(Ptr); 6835 return false; 6836 } 6837 6838 void LoopVectorizationCostModel::collectValuesToIgnore() { 6839 // Ignore ephemeral values. 6840 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6841 6842 // Ignore type-promoting instructions we identified during reduction 6843 // detection. 6844 for (auto &Reduction : Legal->getReductionVars()) { 6845 RecurrenceDescriptor &RedDes = Reduction.second; 6846 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6847 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6848 } 6849 // Ignore type-casting instructions we identified during induction 6850 // detection. 6851 for (auto &Induction : Legal->getInductionVars()) { 6852 InductionDescriptor &IndDes = Induction.second; 6853 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6854 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6855 } 6856 } 6857 6858 void LoopVectorizationCostModel::collectInLoopReductions() { 6859 // For the moment, without predicated reduction instructions, we do not 6860 // support inloop reductions whilst folding the tail, and hence in those cases 6861 // all reductions are currently out of the loop. 6862 if (!PreferInLoopReductions || foldTailByMasking()) 6863 return; 6864 6865 for (auto &Reduction : Legal->getReductionVars()) { 6866 PHINode *Phi = Reduction.first; 6867 RecurrenceDescriptor &RdxDesc = Reduction.second; 6868 6869 // We don't collect reductions that are type promoted (yet). 6870 if (RdxDesc.getRecurrenceType() != Phi->getType()) 6871 continue; 6872 6873 // Check that we can correctly put the reductions into the loop, by 6874 // finding the chain of operations that leads from the phi to the loop 6875 // exit value. 
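    // Illustrative sketch (hypothetical IR): for a plain integer sum
    //   %sum      = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
    //   %sum.next = add i32 %sum, %val
    // the chain is just { %sum.next } and the reduction can be kept in-loop;
    // if no such chain from the phi to the exit value is found, the returned
    // list is empty and the reduction stays out of loop.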
6876 SmallVector<Instruction *, 4> ReductionOperations = 6877 RdxDesc.getReductionOpChain(Phi, TheLoop); 6878 bool InLoop = !ReductionOperations.empty(); 6879 if (InLoop) 6880 InLoopReductionChains[Phi] = ReductionOperations; 6881 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 6882 << " reduction for phi: " << *Phi << "\n"); 6883 } 6884 } 6885 6886 // TODO: we could return a pair of values that specify the max VF and 6887 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6888 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 6889 // doesn't have a cost model that can choose which plan to execute if 6890 // more than one is generated. 6891 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6892 LoopVectorizationCostModel &CM) { 6893 unsigned WidestType; 6894 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6895 return WidestVectorRegBits / WidestType; 6896 } 6897 6898 VectorizationFactor 6899 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 6900 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 6901 ElementCount VF = UserVF; 6902 // Outer loop handling: They may require CFG and instruction level 6903 // transformations before even evaluating whether vectorization is profitable. 6904 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6905 // the vectorization pipeline. 6906 if (!OrigLoop->empty()) { 6907 // If the user doesn't provide a vectorization factor, determine a 6908 // reasonable one. 6909 if (UserVF.isZero()) { 6910 VF = ElementCount::getFixed( 6911 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 6912 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6913 6914 // Make sure we have a VF > 1 for stress testing. 6915 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 6916 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6917 << "overriding computed VF.\n"); 6918 VF = ElementCount::getFixed(4); 6919 } 6920 } 6921 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6922 assert(isPowerOf2_32(VF.getKnownMinValue()) && 6923 "VF needs to be a power of two"); 6924 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 6925 << "VF " << VF << " to build VPlans.\n"); 6926 buildVPlans(VF.getKnownMinValue(), VF.getKnownMinValue()); 6927 6928 // For VPlan build stress testing, we bail out after VPlan construction. 6929 if (VPlanBuildStressTest) 6930 return VectorizationFactor::Disabled(); 6931 6932 return {VF, 0 /*Cost*/}; 6933 } 6934 6935 LLVM_DEBUG( 6936 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6937 "VPlan-native path.\n"); 6938 return VectorizationFactor::Disabled(); 6939 } 6940 6941 Optional<VectorizationFactor> 6942 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 6943 assert(!UserVF.isScalable() && "scalable vectorization not yet handled"); 6944 assert(OrigLoop->empty() && "Inner loop expected."); 6945 Optional<unsigned> MaybeMaxVF = 6946 CM.computeMaxVF(UserVF.getKnownMinValue(), UserIC); 6947 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6948 return None; 6949 6950 // Invalidate interleave groups if all blocks of loop will be predicated. 
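  // For example, a group of two adjacent loads would have to be emitted as a
  // masked interleaved access once the tail is folded by masking; if the
  // target cannot do that, it is better to drop the group and let the cost
  // model re-decide how to widen its members individually.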
6951 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6952 !useMaskedInterleavedAccesses(*TTI)) { 6953 LLVM_DEBUG( 6954 dbgs() 6955 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6956 "which requires masked-interleaved support.\n"); 6957 if (CM.InterleaveInfo.invalidateGroups()) 6958 // Invalidating interleave groups also requires invalidating all decisions 6959 // based on them, which includes widening decisions and uniform and scalar 6960 // values. 6961 CM.invalidateCostModelingDecisions(); 6962 } 6963 6964 if (!UserVF.isZero()) { 6965 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6966 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 6967 "VF needs to be a power of two"); 6968 // Collect the instructions (and their associated costs) that will be more 6969 // profitable to scalarize. 6970 CM.selectUserVectorizationFactor(UserVF); 6971 CM.collectInLoopReductions(); 6972 buildVPlansWithVPRecipes(UserVF.getKnownMinValue(), 6973 UserVF.getKnownMinValue()); 6974 LLVM_DEBUG(printPlans(dbgs())); 6975 return {{UserVF, 0}}; 6976 } 6977 6978 unsigned MaxVF = MaybeMaxVF.getValue(); 6979 assert(MaxVF != 0 && "MaxVF is zero."); 6980 6981 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6982 // Collect Uniform and Scalar instructions after vectorization with VF. 6983 CM.collectUniformsAndScalars(ElementCount::getFixed(VF)); 6984 6985 // Collect the instructions (and their associated costs) that will be more 6986 // profitable to scalarize. 6987 if (VF > 1) 6988 CM.collectInstsToScalarize(ElementCount::getFixed(VF)); 6989 } 6990 6991 CM.collectInLoopReductions(); 6992 6993 buildVPlansWithVPRecipes(1, MaxVF); 6994 LLVM_DEBUG(printPlans(dbgs())); 6995 if (MaxVF == 1) 6996 return VectorizationFactor::Disabled(); 6997 6998 // Select the optimal vectorization factor. 6999 return CM.selectVectorizationFactor(MaxVF); 7000 } 7001 7002 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7003 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7004 << '\n'); 7005 BestVF = VF; 7006 BestUF = UF; 7007 7008 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7009 return !Plan->hasVF(VF); 7010 }); 7011 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7012 } 7013 7014 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7015 DominatorTree *DT) { 7016 // Perform the actual loop transformation. 7017 7018 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7019 VPCallbackILV CallbackILV(ILV); 7020 7021 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7022 7023 VPTransformState State{*BestVF, BestUF, LI, 7024 DT, ILV.Builder, ILV.VectorLoopValueMap, 7025 &ILV, CallbackILV}; 7026 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7027 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7028 State.CanonicalIV = ILV.Induction; 7029 7030 //===------------------------------------------------===// 7031 // 7032 // Notice: any optimization or new instruction that go 7033 // into the code below should also be implemented in 7034 // the cost-model. 7035 // 7036 //===------------------------------------------------===// 7037 7038 // 2. Copy and widen instructions from the old loop into the new loop. 7039 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7040 VPlans.front()->execute(&State); 7041 7042 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7043 // predication, updating analyses. 
7044 ILV.fixVectorizedLoop(); 7045 } 7046 7047 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7048 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7049 BasicBlock *Latch = OrigLoop->getLoopLatch(); 7050 7051 // We create new control-flow for the vectorized loop, so the original 7052 // condition will be dead after vectorization if it's only used by the 7053 // branch. 7054 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 7055 if (Cmp && Cmp->hasOneUse()) 7056 DeadInstructions.insert(Cmp); 7057 7058 // We create new "steps" for induction variable updates to which the original 7059 // induction variables map. An original update instruction will be dead if 7060 // all its users except the induction variable are dead. 7061 for (auto &Induction : Legal->getInductionVars()) { 7062 PHINode *Ind = Induction.first; 7063 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7064 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7065 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7066 })) 7067 DeadInstructions.insert(IndUpdate); 7068 7069 // We record as "Dead" also the type-casting instructions we had identified 7070 // during induction analysis. We don't need any handling for them in the 7071 // vectorized loop because we have proven that, under a proper runtime 7072 // test guarding the vectorized loop, the value of the phi, and the casted 7073 // value of the phi, are the same. The last instruction in this casting chain 7074 // will get its scalar/vector/widened def from the scalar/vector/widened def 7075 // of the respective phi node. Any other casts in the induction def-use chain 7076 // have no other uses outside the phi update chain, and will be ignored. 7077 InductionDescriptor &IndDes = Induction.second; 7078 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7079 DeadInstructions.insert(Casts.begin(), Casts.end()); 7080 } 7081 } 7082 7083 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7084 7085 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7086 7087 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7088 Instruction::BinaryOps BinOp) { 7089 // When unrolling and the VF is 1, we only need to add a simple scalar. 7090 Type *Ty = Val->getType(); 7091 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7092 7093 if (Ty->isFloatingPointTy()) { 7094 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7095 7096 // Floating point operations had to be 'fast' to enable the unrolling. 7097 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7098 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7099 } 7100 Constant *C = ConstantInt::get(Ty, StartIdx); 7101 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7102 } 7103 7104 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7105 SmallVector<Metadata *, 4> MDs; 7106 // Reserve first location for self reference to the LoopID metadata node. 7107 MDs.push_back(nullptr); 7108 bool IsUnrollMetadata = false; 7109 MDNode *LoopID = L->getLoopID(); 7110 if (LoopID) { 7111 // First find existing loop unrolling disable metadata. 
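    // For example, a loop that already carries
    //   !llvm.loop !0
    //   !0 = distinct !{!0, !1}
    //   !1 = !{!"llvm.loop.unroll.disable"}
    // needs no extra runtime-unroll-disable operand.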
7112 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7113 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7114 if (MD) { 7115 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7116 IsUnrollMetadata = 7117 S && S->getString().startswith("llvm.loop.unroll.disable"); 7118 } 7119 MDs.push_back(LoopID->getOperand(i)); 7120 } 7121 } 7122 7123 if (!IsUnrollMetadata) { 7124 // Add runtime unroll disable metadata. 7125 LLVMContext &Context = L->getHeader()->getContext(); 7126 SmallVector<Metadata *, 1> DisableOperands; 7127 DisableOperands.push_back( 7128 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7129 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7130 MDs.push_back(DisableNode); 7131 MDNode *NewLoopID = MDNode::get(Context, MDs); 7132 // Set operand 0 to refer to the loop id itself. 7133 NewLoopID->replaceOperandWith(0, NewLoopID); 7134 L->setLoopID(NewLoopID); 7135 } 7136 } 7137 7138 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7139 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7140 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 7141 bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start)); 7142 7143 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 7144 if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) { 7145 Range.End = TmpVF; 7146 break; 7147 } 7148 7149 return PredicateAtRangeStart; 7150 } 7151 7152 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7153 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7154 /// of VF's starting at a given VF and extending it as much as possible. Each 7155 /// vectorization decision can potentially shorten this sub-range during 7156 /// buildVPlan(). 7157 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 7158 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7159 VFRange SubRange = {VF, MaxVF + 1}; 7160 VPlans.push_back(buildVPlan(SubRange)); 7161 VF = SubRange.End; 7162 } 7163 } 7164 7165 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7166 VPlanPtr &Plan) { 7167 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7168 7169 // Look for cached value. 7170 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7171 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7172 if (ECEntryIt != EdgeMaskCache.end()) 7173 return ECEntryIt->second; 7174 7175 VPValue *SrcMask = createBlockInMask(Src, Plan); 7176 7177 // The terminator has to be a branch inst! 7178 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7179 assert(BI && "Unexpected terminator found"); 7180 7181 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7182 return EdgeMaskCache[Edge] = SrcMask; 7183 7184 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 7185 assert(EdgeMask && "No Edge Mask found for condition"); 7186 7187 if (BI->getSuccessor(0) != Dst) 7188 EdgeMask = Builder.createNot(EdgeMask); 7189 7190 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 7191 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 7192 7193 return EdgeMaskCache[Edge] = EdgeMask; 7194 } 7195 7196 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7197 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7198 7199 // Look for cached value. 
7200 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7201 if (BCEntryIt != BlockMaskCache.end()) 7202 return BCEntryIt->second; 7203 7204 // All-one mask is modelled as no-mask following the convention for masked 7205 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7206 VPValue *BlockMask = nullptr; 7207 7208 if (OrigLoop->getHeader() == BB) { 7209 if (!CM.blockNeedsPredication(BB)) 7210 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 7211 7212 // Introduce the early-exit compare IV <= BTC to form header block mask. 7213 // This is used instead of IV < TC because TC may wrap, unlike BTC. 7214 // Start by constructing the desired canonical IV. 7215 VPValue *IV = nullptr; 7216 if (Legal->getPrimaryInduction()) 7217 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 7218 else { 7219 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 7220 Builder.getInsertBlock()->appendRecipe(IVRecipe); 7221 IV = IVRecipe->getVPValue(); 7222 } 7223 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 7224 bool TailFolded = !CM.isScalarEpilogueAllowed(); 7225 7226 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 7227 // While ActiveLaneMask is a binary op that consumes the loop tripcount 7228 // as a second argument, we only pass the IV here and extract the 7229 // tripcount from the transform state where codegen of the VP instructions 7230 // happen. 7231 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 7232 } else { 7233 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 7234 } 7235 return BlockMaskCache[BB] = BlockMask; 7236 } 7237 7238 // This is the block mask. We OR all incoming edges. 7239 for (auto *Predecessor : predecessors(BB)) { 7240 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 7241 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 7242 return BlockMaskCache[BB] = EdgeMask; 7243 7244 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
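      // First incoming edge: adopt its mask directly instead of OR-ing it with
      // the (conceptually all-one) initial mask.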
7245 BlockMask = EdgeMask; 7246 continue; 7247 } 7248 7249 BlockMask = Builder.createOr(BlockMask, EdgeMask); 7250 } 7251 7252 return BlockMaskCache[BB] = BlockMask; 7253 } 7254 7255 VPWidenMemoryInstructionRecipe * 7256 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 7257 VPlanPtr &Plan) { 7258 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7259 "Must be called with either a load or store"); 7260 7261 auto willWiden = [&](ElementCount VF) -> bool { 7262 assert(!VF.isScalable() && "unexpected scalable ElementCount"); 7263 if (VF.isScalar()) 7264 return false; 7265 LoopVectorizationCostModel::InstWidening Decision = 7266 CM.getWideningDecision(I, VF); 7267 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 7268 "CM decision should be taken at this point."); 7269 if (Decision == LoopVectorizationCostModel::CM_Interleave) 7270 return true; 7271 if (CM.isScalarAfterVectorization(I, VF) || 7272 CM.isProfitableToScalarize(I, VF)) 7273 return false; 7274 return Decision != LoopVectorizationCostModel::CM_Scalarize; 7275 }; 7276 7277 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7278 return nullptr; 7279 7280 VPValue *Mask = nullptr; 7281 if (Legal->isMaskRequired(I)) 7282 Mask = createBlockInMask(I->getParent(), Plan); 7283 7284 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 7285 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 7286 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 7287 7288 StoreInst *Store = cast<StoreInst>(I); 7289 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 7290 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 7291 } 7292 7293 VPWidenIntOrFpInductionRecipe * 7294 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 7295 // Check if this is an integer or fp induction. If so, build the recipe that 7296 // produces its scalar and vector values. 7297 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 7298 if (II.getKind() == InductionDescriptor::IK_IntInduction || 7299 II.getKind() == InductionDescriptor::IK_FpInduction) 7300 return new VPWidenIntOrFpInductionRecipe(Phi); 7301 7302 return nullptr; 7303 } 7304 7305 VPWidenIntOrFpInductionRecipe * 7306 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 7307 VFRange &Range) const { 7308 // Optimize the special case where the source is a constant integer 7309 // induction variable. Notice that we can only optimize the 'trunc' case 7310 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 7311 // (c) other casts depend on pointer size. 7312 7313 // Determine whether \p K is a truncation based on an induction variable that 7314 // can be optimized. 7315 auto isOptimizableIVTruncate = 7316 [&](Instruction *K) -> std::function<bool(ElementCount)> { 7317 return [=](ElementCount VF) -> bool { 7318 return CM.isOptimizableIVTruncate(K, VF); 7319 }; 7320 }; 7321 7322 if (LoopVectorizationPlanner::getDecisionAndClampRange( 7323 isOptimizableIVTruncate(I), Range)) 7324 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 7325 I); 7326 return nullptr; 7327 } 7328 7329 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 7330 // We know that all PHIs in non-header blocks are converted into selects, so 7331 // we don't have to worry about the insertion order and we can just use the 7332 // builder. At this point we generate the predication tree. 
There may be 7333 // duplications since this is a simple recursive scan, but future 7334 // optimizations will clean it up. 7335 7336 SmallVector<VPValue *, 2> Operands; 7337 unsigned NumIncoming = Phi->getNumIncomingValues(); 7338 for (unsigned In = 0; In < NumIncoming; In++) { 7339 VPValue *EdgeMask = 7340 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 7341 assert((EdgeMask || NumIncoming == 1) && 7342 "Multiple predecessors with one having a full mask"); 7343 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 7344 if (EdgeMask) 7345 Operands.push_back(EdgeMask); 7346 } 7347 return new VPBlendRecipe(Phi, Operands); 7348 } 7349 7350 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 7351 VPlan &Plan) const { 7352 7353 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7354 [this, CI](ElementCount VF) { 7355 return CM.isScalarWithPredication(CI, VF); 7356 }, 7357 Range); 7358 7359 if (IsPredicated) 7360 return nullptr; 7361 7362 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7363 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 7364 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 7365 return nullptr; 7366 7367 auto willWiden = [&](ElementCount VF) -> bool { 7368 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7369 // The following case may be scalarized depending on the VF. 7370 // The flag shows whether we use Intrinsic or a usual Call for vectorized 7371 // version of the instruction. 7372 // Is it beneficial to perform intrinsic call compared to lib call? 7373 bool NeedToScalarize = false; 7374 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 7375 bool UseVectorIntrinsic = 7376 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 7377 return UseVectorIntrinsic || !NeedToScalarize; 7378 }; 7379 7380 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7381 return nullptr; 7382 7383 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 7384 } 7385 7386 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 7387 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 7388 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 7389 // Instruction should be widened, unless it is scalar after vectorization, 7390 // scalarization is profitable or it is predicated. 
7391 auto WillScalarize = [this, I](ElementCount VF) -> bool { 7392 return CM.isScalarAfterVectorization(I, VF) || 7393 CM.isProfitableToScalarize(I, VF) || 7394 CM.isScalarWithPredication(I, VF); 7395 }; 7396 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 7397 Range); 7398 } 7399 7400 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 7401 auto IsVectorizableOpcode = [](unsigned Opcode) { 7402 switch (Opcode) { 7403 case Instruction::Add: 7404 case Instruction::And: 7405 case Instruction::AShr: 7406 case Instruction::BitCast: 7407 case Instruction::FAdd: 7408 case Instruction::FCmp: 7409 case Instruction::FDiv: 7410 case Instruction::FMul: 7411 case Instruction::FNeg: 7412 case Instruction::FPExt: 7413 case Instruction::FPToSI: 7414 case Instruction::FPToUI: 7415 case Instruction::FPTrunc: 7416 case Instruction::FRem: 7417 case Instruction::FSub: 7418 case Instruction::ICmp: 7419 case Instruction::IntToPtr: 7420 case Instruction::LShr: 7421 case Instruction::Mul: 7422 case Instruction::Or: 7423 case Instruction::PtrToInt: 7424 case Instruction::SDiv: 7425 case Instruction::Select: 7426 case Instruction::SExt: 7427 case Instruction::Shl: 7428 case Instruction::SIToFP: 7429 case Instruction::SRem: 7430 case Instruction::Sub: 7431 case Instruction::Trunc: 7432 case Instruction::UDiv: 7433 case Instruction::UIToFP: 7434 case Instruction::URem: 7435 case Instruction::Xor: 7436 case Instruction::ZExt: 7437 return true; 7438 } 7439 return false; 7440 }; 7441 7442 if (!IsVectorizableOpcode(I->getOpcode())) 7443 return nullptr; 7444 7445 // Success: widen this instruction. 7446 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 7447 } 7448 7449 VPBasicBlock *VPRecipeBuilder::handleReplication( 7450 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7451 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7452 VPlanPtr &Plan) { 7453 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7454 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 7455 Range); 7456 7457 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7458 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 7459 Range); 7460 7461 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 7462 IsUniform, IsPredicated); 7463 setRecipe(I, Recipe); 7464 7465 // Find if I uses a predicated instruction. If so, it will use its scalar 7466 // value. Avoid hoisting the insert-element which packs the scalar value into 7467 // a vector value, as that happens iff all users use the vector value. 7468 for (auto &Op : I->operands()) 7469 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7470 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7471 PredInst2Recipe[PredInst]->setAlsoPack(false); 7472 7473 // Finalize the recipe for Instr, first if it is not predicated. 7474 if (!IsPredicated) { 7475 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7476 VPBB->appendRecipe(Recipe); 7477 return VPBB; 7478 } 7479 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7480 assert(VPBB->getSuccessors().empty() && 7481 "VPBB has successors when handling predicated replication."); 7482 // Record predicated instructions for above packing optimizations. 
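  // The recipe is then wrapped in a triangular if-then region (built by
  // createReplicateRegion below), and the remaining instructions of the
  // original block continue in a fresh VPBasicBlock after that region:
  //   VPBB -> [pred.<opcode>.entry -> pred.<opcode>.if
  //            -> pred.<opcode>.continue] -> RegSucc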
7483 PredInst2Recipe[I] = Recipe; 7484 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7485 VPBlockUtils::insertBlockAfter(Region, VPBB); 7486 auto *RegSucc = new VPBasicBlock(); 7487 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7488 return RegSucc; 7489 } 7490 7491 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7492 VPRecipeBase *PredRecipe, 7493 VPlanPtr &Plan) { 7494 // Instructions marked for predication are replicated and placed under an 7495 // if-then construct to prevent side-effects. 7496 7497 // Generate recipes to compute the block mask for this region. 7498 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7499 7500 // Build the triangular if-then region. 7501 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7502 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7503 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7504 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7505 auto *PHIRecipe = 7506 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7507 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7508 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7509 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7510 7511 // Note: first set Entry as region entry and then connect successors starting 7512 // from it in order, to propagate the "parent" of each VPBasicBlock. 7513 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7514 VPBlockUtils::connectBlocks(Pred, Exit); 7515 7516 return Region; 7517 } 7518 7519 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 7520 VFRange &Range, 7521 VPlanPtr &Plan) { 7522 // First, check for specific widening recipes that deal with calls, memory 7523 // operations, inductions and Phi nodes. 7524 if (auto *CI = dyn_cast<CallInst>(Instr)) 7525 return tryToWidenCall(CI, Range, *Plan); 7526 7527 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 7528 return tryToWidenMemory(Instr, Range, Plan); 7529 7530 VPRecipeBase *Recipe; 7531 if (auto Phi = dyn_cast<PHINode>(Instr)) { 7532 if (Phi->getParent() != OrigLoop->getHeader()) 7533 return tryToBlend(Phi, Plan); 7534 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 7535 return Recipe; 7536 return new VPWidenPHIRecipe(Phi); 7537 } 7538 7539 if (isa<TruncInst>(Instr) && 7540 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 7541 return Recipe; 7542 7543 if (!shouldWiden(Instr, Range)) 7544 return nullptr; 7545 7546 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 7547 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 7548 OrigLoop); 7549 7550 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 7551 bool InvariantCond = 7552 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 7553 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 7554 InvariantCond); 7555 } 7556 7557 return tryToWiden(Instr, *Plan); 7558 } 7559 7560 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7561 unsigned MaxVF) { 7562 assert(OrigLoop->empty() && "Inner loop expected."); 7563 7564 // Collect conditions feeding internal conditional branches; they need to be 7565 // represented in VPlan for it to model masking. 
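  // For example, in
  //   for (i = 0; i < n; ++i)
  //     if (a[i] > 0)
  //       b[i] = 1;
  // the compare feeding the conditional branch needs a VPValue so the edge and
  // block masks built later can refer to it.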
7566   SmallPtrSet<Value *, 1> NeedDef;
7567
7568   auto *Latch = OrigLoop->getLoopLatch();
7569   for (BasicBlock *BB : OrigLoop->blocks()) {
7570     if (BB == Latch)
7571       continue;
7572     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7573     if (Branch && Branch->isConditional())
7574       NeedDef.insert(Branch->getCondition());
7575   }
7576
7577   // If the tail is to be folded by masking, the primary induction variable, if
7578   // it exists, needs to be represented in VPlan for it to model early-exit
7579   // masking. Also, both the Phi and the live-out instruction of each reduction
7580   // are required in order to introduce a select between them in VPlan.
7581   if (CM.foldTailByMasking()) {
7582     if (Legal->getPrimaryInduction())
7583       NeedDef.insert(Legal->getPrimaryInduction());
7584     for (auto &Reduction : Legal->getReductionVars()) {
7585       NeedDef.insert(Reduction.first);
7586       NeedDef.insert(Reduction.second.getLoopExitInstr());
7587     }
7588   }
7589
7590   // Collect instructions from the original loop that will become trivially dead
7591   // in the vectorized loop. We don't need to vectorize these instructions. For
7592   // example, original induction update instructions can become dead because we
7593   // separately emit induction "steps" when generating code for the new loop.
7594   // Similarly, we create a new latch condition when setting up the structure
7595   // of the new loop, so the old one can become dead.
7596   SmallPtrSet<Instruction *, 4> DeadInstructions;
7597   collectTriviallyDeadInstructions(DeadInstructions);
7598
7599   // Add assume instructions we need to drop to DeadInstructions, to prevent
7600   // them from being added to the VPlan.
7601   // TODO: We only need to drop assumes in blocks that get flattened. If the
7602   // control flow is preserved, we should keep them.
7603   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7604   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7605
7606   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7607   // Dead instructions do not need sinking. Remove them from SinkAfter.
7608   for (Instruction *I : DeadInstructions)
7609     SinkAfter.erase(I);
7610
7611   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7612     VFRange SubRange = {VF, MaxVF + 1};
7613     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7614                                              DeadInstructions, SinkAfter));
7615     VF = SubRange.End;
7616   }
7617 }
7618
7619 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7620     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7621     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7622     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7623
7624   // Hold a mapping from predicated instructions to their recipes, in order to
7625   // fix their AlsoPack behavior if a user is determined to replicate and use a
7626   // scalar instead of a vector value.
7627   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7628
7629   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7630
7631   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7632
7633   // ---------------------------------------------------------------------------
7634   // Pre-construction: record ingredients whose recipes we'll need to further
7635   // process after constructing the initial VPlan.
7636   // ---------------------------------------------------------------------------
7637
7638   // Mark instructions we'll need to sink later and their targets as
7639   // ingredients whose recipe we'll need to record.
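  // Sink-after pairs typically come from first-order recurrences, where a use
  // of the recurrence phi has to be moved after the instruction that produces
  // the recurrence's previous value.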
7640   for (auto &Entry : SinkAfter) {
7641     RecipeBuilder.recordRecipeOf(Entry.first);
7642     RecipeBuilder.recordRecipeOf(Entry.second);
7643   }
7644   for (auto &Reduction : CM.getInLoopReductionChains()) {
7645     PHINode *Phi = Reduction.first;
7646     RecurrenceDescriptor::RecurrenceKind Kind =
7647         Legal->getReductionVars()[Phi].getRecurrenceKind();
7648     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7649
7650     RecipeBuilder.recordRecipeOf(Phi);
7651     for (auto &R : ReductionOperations) {
7652       RecipeBuilder.recordRecipeOf(R);
7653       // For min/max reductions, where we have a pair of icmp/select, we also
7654       // need to record the ICmp recipe, so it can be removed later.
7655       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7656           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7657         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
7658       }
7659     }
7660   }
7661
7662   // For each interleave group which is relevant for this (possibly trimmed)
7663   // Range, add it to the set of groups to be later applied to the VPlan and add
7664   // placeholders for its members' Recipes which we'll be replacing with a
7665   // single VPInterleaveRecipe.
7666   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7667     auto applyIG = [IG, this](ElementCount VF) -> bool {
7668       return (VF.isVector() && // Query is illegal for VF == 1
7669               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7670                   LoopVectorizationCostModel::CM_Interleave);
7671     };
7672     if (!getDecisionAndClampRange(applyIG, Range))
7673       continue;
7674     InterleaveGroups.insert(IG);
7675     for (unsigned i = 0; i < IG->getFactor(); i++)
7676       if (Instruction *Member = IG->getMember(i))
7677         RecipeBuilder.recordRecipeOf(Member);
7678   }
7679
7680   // ---------------------------------------------------------------------------
7681   // Build initial VPlan: Scan the body of the loop in a topological order to
7682   // visit each basic block after having visited its predecessor basic blocks.
7683   // ---------------------------------------------------------------------------
7684
7685   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7686   auto Plan = std::make_unique<VPlan>();
7687   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7688   Plan->setEntry(VPBB);
7689
7690   // Represent values that will have defs inside VPlan.
7691   for (Value *V : NeedDef)
7692     Plan->addVPValue(V);
7693
7694   // Scan the body of the loop in a topological order to visit each basic block
7695   // after having visited its predecessor basic blocks.
7696   LoopBlocksDFS DFS(OrigLoop);
7697   DFS.perform(LI);
7698
7699   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7700     // Relevant instructions from basic block BB will be grouped into VPRecipe
7701     // ingredients and fill a new VPBasicBlock.
7702     unsigned VPBBsForBB = 0;
7703     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7704     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7705     VPBB = FirstVPBBForBB;
7706     Builder.setInsertPoint(VPBB);
7707
7708     // Introduce each ingredient into VPlan.
7709     // TODO: Model and preserve debug intrinsics in VPlan.
7710     for (Instruction &I : BB->instructionsWithoutDebug()) {
7711       Instruction *Instr = &I;
7712
7713       // First filter out irrelevant instructions, to ensure no recipes are
7714       // built for them.
7715 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 7716 continue; 7717 7718 if (auto Recipe = 7719 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 7720 RecipeBuilder.setRecipe(Instr, Recipe); 7721 VPBB->appendRecipe(Recipe); 7722 continue; 7723 } 7724 7725 // Otherwise, if all widening options failed, Instruction is to be 7726 // replicated. This may create a successor for VPBB. 7727 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7728 Instr, Range, VPBB, PredInst2Recipe, Plan); 7729 if (NextVPBB != VPBB) { 7730 VPBB = NextVPBB; 7731 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7732 : ""); 7733 } 7734 } 7735 } 7736 7737 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7738 // may also be empty, such as the last one VPBB, reflecting original 7739 // basic-blocks with no recipes. 7740 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7741 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7742 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7743 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7744 delete PreEntry; 7745 7746 // --------------------------------------------------------------------------- 7747 // Transform initial VPlan: Apply previously taken decisions, in order, to 7748 // bring the VPlan to its final state. 7749 // --------------------------------------------------------------------------- 7750 7751 // Apply Sink-After legal constraints. 7752 for (auto &Entry : SinkAfter) { 7753 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7754 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7755 Sink->moveAfter(Target); 7756 } 7757 7758 // Interleave memory: for each Interleave Group we marked earlier as relevant 7759 // for this VPlan, replace the Recipes widening its memory instructions with a 7760 // single VPInterleaveRecipe at its insertion point. 7761 for (auto IG : InterleaveGroups) { 7762 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7763 RecipeBuilder.getRecipe(IG->getInsertPos())); 7764 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7765 ->insertBefore(Recipe); 7766 7767 for (unsigned i = 0; i < IG->getFactor(); ++i) 7768 if (Instruction *Member = IG->getMember(i)) { 7769 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7770 } 7771 } 7772 7773 // Adjust the recipes for any inloop reductions. 7774 if (Range.Start > 1) 7775 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 7776 7777 // Finally, if tail is folded by masking, introduce selects between the phi 7778 // and the live-out instruction of each reduction, at the end of the latch. 
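  // For example, a tail-folded sum reduction gets
  //   %s = select <header-mask>, %sum.next, %sum.phi
  // so that lanes which are masked out keep the unmodified value of the phi.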
7779   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
7780     Builder.setInsertPoint(VPBB);
7781     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7782     for (auto &Reduction : Legal->getReductionVars()) {
7783       assert(!CM.isInLoopReduction(Reduction.first) &&
7784              "Didn't expect inloop tail folded reduction yet!");
7785       VPValue *Phi = Plan->getVPValue(Reduction.first);
7786       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7787       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7788     }
7789   }
7790
7791   std::string PlanName;
7792   raw_string_ostream RSO(PlanName);
7793   ElementCount VF = ElementCount::getFixed(Range.Start);
7794   Plan->addVF(VF);
7795   RSO << "Initial VPlan for VF={" << VF;
7796   for (VF *= 2; VF.getKnownMinValue() < Range.End; VF *= 2) {
7797     Plan->addVF(VF);
7798     RSO << "," << VF;
7799   }
7800   RSO << "},UF>=1";
7801   RSO.flush();
7802   Plan->setName(PlanName);
7803
7804   return Plan;
7805 }
7806
7807 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7808   // Outer loop handling: They may require CFG and instruction level
7809   // transformations before even evaluating whether vectorization is profitable.
7810   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7811   // the vectorization pipeline.
7812   assert(!OrigLoop->empty());
7813   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7814
7815   // Create new empty VPlan
7816   auto Plan = std::make_unique<VPlan>();
7817
7818   // Build hierarchical CFG
7819   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7820   HCFGBuilder.buildHierarchicalCFG();
7821
7822   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7823     Plan->addVF(ElementCount::getFixed(VF));
7824
7825   if (EnableVPlanPredication) {
7826     VPlanPredicator VPP(*Plan);
7827     VPP.predicate();
7828
7829     // Avoid running transformation to recipes until masked code generation in
7830     // VPlan-native path is in place.
7831     return Plan;
7832   }
7833
7834   SmallPtrSet<Instruction *, 1> DeadInstructions;
7835   VPlanTransforms::VPInstructionsToVPRecipes(
7836       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7837   return Plan;
7838 }
7839
7840 // Adjust the recipes for any inloop reductions. The chain of instructions
7841 // leading from the loop exit instr to the phi needs to be converted to
7842 // reductions, with one operand being vector and the other being the scalar
7843 // reduction chain.
7844 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
7845     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
7846   for (auto &Reduction : CM.getInLoopReductionChains()) {
7847     PHINode *Phi = Reduction.first;
7848     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
7849     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7850
7851     // ReductionOperations are ordered top-down from the phi's use to the
7852     // LoopExitValue. We keep track of the previous item (the Chain) to tell
7853     // which of the two operands will remain scalar and which will be reduced.
7854     // For minmax the chain will be the select instructions.
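    // For example, for
    //   %max = phi i32 [ %init, %preheader ], [ %sel, %loop ]
    //   %cmp = icmp sgt i32 %val, %max
    //   %sel = select i1 %cmp, i32 %val, i32 %max
    // the chain is { %sel }; the select's widen recipe is replaced by a
    // reduction recipe and the recipe of %cmp is erased.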
7855 Instruction *Chain = Phi; 7856 for (Instruction *R : ReductionOperations) { 7857 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 7858 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind(); 7859 7860 VPValue *ChainOp = Plan->getVPValue(Chain); 7861 unsigned FirstOpId; 7862 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7863 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7864 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC && 7865 "Expected to replace a VPWidenSelectSC"); 7866 FirstOpId = 1; 7867 } else { 7868 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC && 7869 "Expected to replace a VPWidenSC"); 7870 FirstOpId = 0; 7871 } 7872 unsigned VecOpId = 7873 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 7874 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 7875 7876 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 7877 &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI); 7878 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 7879 WidenRecipe->eraseFromParent(); 7880 7881 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7882 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7883 VPRecipeBase *CompareRecipe = 7884 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 7885 assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC && 7886 "Expected to replace a VPWidenSC"); 7887 CompareRecipe->eraseFromParent(); 7888 } 7889 Chain = R; 7890 } 7891 } 7892 } 7893 7894 Value* LoopVectorizationPlanner::VPCallbackILV:: 7895 getOrCreateVectorValues(Value *V, unsigned Part) { 7896 return ILV.getOrCreateVectorValue(V, Part); 7897 } 7898 7899 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7900 Value *V, const VPIteration &Instance) { 7901 return ILV.getOrCreateScalarValue(V, Instance); 7902 } 7903 7904 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7905 VPSlotTracker &SlotTracker) const { 7906 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7907 IG->getInsertPos()->printAsOperand(O, false); 7908 O << ", "; 7909 getAddr()->printAsOperand(O, SlotTracker); 7910 VPValue *Mask = getMask(); 7911 if (Mask) { 7912 O << ", "; 7913 Mask->printAsOperand(O, SlotTracker); 7914 } 7915 for (unsigned i = 0; i < IG->getFactor(); ++i) 7916 if (Instruction *I = IG->getMember(i)) 7917 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 7918 } 7919 7920 void VPWidenCallRecipe::execute(VPTransformState &State) { 7921 State.ILV->widenCallInstruction(Ingredient, User, State); 7922 } 7923 7924 void VPWidenSelectRecipe::execute(VPTransformState &State) { 7925 State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State); 7926 } 7927 7928 void VPWidenRecipe::execute(VPTransformState &State) { 7929 State.ILV->widenInstruction(Ingredient, User, State); 7930 } 7931 7932 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7933 State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant, 7934 IsIndexLoopInvariant, State); 7935 } 7936 7937 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7938 assert(!State.Instance && "Int or FP induction being replicated."); 7939 State.ILV->widenIntOrFpInduction(IV, Trunc); 7940 } 7941 7942 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7943 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7944 } 7945 7946 void VPBlendRecipe::execute(VPTransformState &State) { 7947 
State.ILV->setDebugLocFromInst(State.Builder, Phi); 7948 // We know that all PHIs in non-header blocks are converted into 7949 // selects, so we don't have to worry about the insertion order and we 7950 // can just use the builder. 7951 // At this point we generate the predication tree. There may be 7952 // duplications since this is a simple recursive scan, but future 7953 // optimizations will clean it up. 7954 7955 unsigned NumIncoming = getNumIncomingValues(); 7956 7957 // Generate a sequence of selects of the form: 7958 // SELECT(Mask3, In3, 7959 // SELECT(Mask2, In2, 7960 // SELECT(Mask1, In1, 7961 // In0))) 7962 // Note that Mask0 is never used: lanes for which no path reaches this phi and 7963 // are essentially undef are taken from In0. 7964 InnerLoopVectorizer::VectorParts Entry(State.UF); 7965 for (unsigned In = 0; In < NumIncoming; ++In) { 7966 for (unsigned Part = 0; Part < State.UF; ++Part) { 7967 // We might have single edge PHIs (blocks) - use an identity 7968 // 'select' for the first PHI operand. 7969 Value *In0 = State.get(getIncomingValue(In), Part); 7970 if (In == 0) 7971 Entry[Part] = In0; // Initialize with the first incoming value. 7972 else { 7973 // Select between the current value and the previous incoming edge 7974 // based on the incoming mask. 7975 Value *Cond = State.get(getMask(In), Part); 7976 Entry[Part] = 7977 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 7978 } 7979 } 7980 } 7981 for (unsigned Part = 0; Part < State.UF; ++Part) 7982 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 7983 } 7984 7985 void VPInterleaveRecipe::execute(VPTransformState &State) { 7986 assert(!State.Instance && "Interleave group being replicated."); 7987 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); 7988 } 7989 7990 void VPReductionRecipe::execute(VPTransformState &State) { 7991 assert(!State.Instance && "Reduction being replicated."); 7992 for (unsigned Part = 0; Part < State.UF; ++Part) { 7993 unsigned Kind = RdxDesc->getRecurrenceKind(); 7994 Value *NewVecOp = State.get(VecOp, Part); 7995 Value *NewRed = 7996 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN); 7997 Value *PrevInChain = State.get(ChainOp, Part); 7998 Value *NextInChain; 7999 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8000 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8001 NextInChain = 8002 createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(), 8003 NewRed, PrevInChain); 8004 } else { 8005 NextInChain = State.Builder.CreateBinOp( 8006 (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain); 8007 } 8008 State.ValueMap.setVectorValue(I, Part, NextInChain); 8009 } 8010 } 8011 8012 void VPReplicateRecipe::execute(VPTransformState &State) { 8013 if (State.Instance) { // Generate a single instance. 8014 State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance, 8015 IsPredicated, State); 8016 // Insert scalar instance packing it into a vector. 8017 if (AlsoPack && State.VF.isVector()) { 8018 // If we're constructing lane 0, initialize to start from undef. 
8019       if (State.Instance->Lane == 0) {
8020         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8021         Value *Undef =
8022             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
8023         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
8024       }
8025       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
8026     }
8027     return;
8028   }
8029
8030   // Generate scalar instances for all VF lanes of all UF parts, unless the
8031   // instruction is uniform, in which case generate only the first lane for each
8032   // of the UF parts.
8033   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8034   for (unsigned Part = 0; Part < State.UF; ++Part)
8035     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8036       State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
8037                                       IsPredicated, State);
8038 }
8039
8040 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8041   assert(State.Instance && "Branch on Mask works only on single instance.");
8042
8043   unsigned Part = State.Instance->Part;
8044   unsigned Lane = State.Instance->Lane;
8045
8046   Value *ConditionBit = nullptr;
8047   VPValue *BlockInMask = getMask();
8048   if (BlockInMask) {
8049     ConditionBit = State.get(BlockInMask, Part);
8050     if (ConditionBit->getType()->isVectorTy())
8051       ConditionBit = State.Builder.CreateExtractElement(
8052           ConditionBit, State.Builder.getInt32(Lane));
8053   } else // Block in mask is all-one.
8054     ConditionBit = State.Builder.getTrue();
8055
8056   // Replace the temporary unreachable terminator with a new conditional branch,
8057   // whose two destinations will be set later when they are created.
8058   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8059   assert(isa<UnreachableInst>(CurrentTerminator) &&
8060          "Expected to replace unreachable terminator with conditional branch.");
8061   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8062   CondBr->setSuccessor(0, nullptr);
8063   ReplaceInstWithInst(CurrentTerminator, CondBr);
8064 }
8065
8066 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8067   assert(State.Instance && "Predicated instruction PHI works per instance.");
8068   Instruction *ScalarPredInst = cast<Instruction>(
8069       State.ValueMap.getScalarValue(PredInst, *State.Instance));
8070   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8071   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8072   assert(PredicatingBB && "Predicated block has no single predecessor.");
8073
8074   // By current pack/unpack logic we need to generate only a single phi node: if
8075   // a vector value for the predicated instruction exists at this point it means
8076   // the instruction has vector users only, and a phi for the vector value is
8077   // needed. In this case the recipe of the predicated instruction is marked to
8078   // also do that packing, thereby "hoisting" the insert-element sequence.
8079   // Otherwise, a phi node for the scalar value is needed.
8080   unsigned Part = State.Instance->Part;
8081   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8082     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8083     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8084     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8085     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8086     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8087     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8088 } else { 8089 Type *PredInstType = PredInst->getType(); 8090 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 8091 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 8092 Phi->addIncoming(ScalarPredInst, PredicatedBB); 8093 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 8094 } 8095 } 8096 8097 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 8098 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 8099 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, 8100 getMask()); 8101 } 8102 8103 // Determine how to lower the scalar epilogue, which depends on 1) optimising 8104 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 8105 // predication, and 4) a TTI hook that analyses whether the loop is suitable 8106 // for predication. 8107 static ScalarEpilogueLowering getScalarEpilogueLowering( 8108 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 8109 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 8110 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 8111 LoopVectorizationLegality &LVL) { 8112 // 1) OptSize takes precedence over all other options, i.e. if this is set, 8113 // don't look at hints or options, and don't request a scalar epilogue. 8114 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 8115 // LoopAccessInfo (due to code dependency and not being able to reliably get 8116 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 8117 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 8118 // versioning when the vectorization is forced, unlike hasOptSize. So revert 8119 // back to the old way and vectorize with versioning when forced. See D81345.) 8120 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 8121 PGSOQueryType::IRPass) && 8122 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 8123 return CM_ScalarEpilogueNotAllowedOptSize; 8124 8125 bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() && 8126 !PreferPredicateOverEpilogue; 8127 8128 // 2) Next, if disabling predication is requested on the command line, honour 8129 // this and request a scalar epilogue. 8130 if (PredicateOptDisabled) 8131 return CM_ScalarEpilogueAllowed; 8132 8133 // 3) and 4) look if enabling predication is requested on the command line, 8134 // with a loop hint, or if the TTI hook indicates this is profitable, request 8135 // predication. 8136 if (PreferPredicateOverEpilogue || 8137 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || 8138 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 8139 LVL.getLAI()) && 8140 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) 8141 return CM_ScalarEpilogueNotNeededUsePredicate; 8142 8143 return CM_ScalarEpilogueAllowed; 8144 } 8145 8146 // Process the loop in the VPlan-native vectorization path. This path builds 8147 // VPlan upfront in the vectorization pipeline, which allows to apply 8148 // VPlan-to-VPlan transformations from the very beginning without modifying the 8149 // input LLVM IR. 
8150 static bool processLoopInVPlanNativePath( 8151 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 8152 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 8153 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 8154 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 8155 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 8156 8157 if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) { 8158 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 8159 return false; 8160 } 8161 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 8162 Function *F = L->getHeader()->getParent(); 8163 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 8164 8165 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8166 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 8167 8168 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 8169 &Hints, IAI); 8170 // Use the planner for outer loop vectorization. 8171 // TODO: CM is not used at this point inside the planner. Turn CM into an 8172 // optional argument if we don't need it in the future. 8173 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 8174 8175 // Get user vectorization factor. 8176 const unsigned UserVF = Hints.getWidth(); 8177 8178 // Plan how to best vectorize, return the best VF and its cost. 8179 const VectorizationFactor VF = 8180 LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF)); 8181 8182 // If we are stress testing VPlan builds, do not attempt to generate vector 8183 // code. Masked vector code generation support will follow soon. 8184 // Also, do not attempt to vectorize if no vector code will be produced. 8185 if (VPlanBuildStressTest || EnableVPlanPredication || 8186 VectorizationFactor::Disabled() == VF) 8187 return false; 8188 8189 LVP.setBestPlan(VF.Width, 1); 8190 8191 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 8192 &CM, BFI, PSI); 8193 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 8194 << L->getHeader()->getParent()->getName() << "\"\n"); 8195 LVP.executePlan(LB, DT); 8196 8197 // Mark the loop as already vectorized to avoid vectorizing again. 8198 Hints.setAlreadyVectorized(); 8199 8200 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 8201 return true; 8202 } 8203 8204 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 8205 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 8206 !EnableLoopInterleaving), 8207 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 8208 !EnableLoopVectorization) {} 8209 8210 bool LoopVectorizePass::processLoop(Loop *L) { 8211 assert((EnableVPlanNativePath || L->empty()) && 8212 "VPlan-native path is not enabled. Only process inner loops."); 8213 8214 #ifndef NDEBUG 8215 const std::string DebugLocStr = getDebugLocString(L); 8216 #endif /* NDEBUG */ 8217 8218 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 8219 << L->getHeader()->getParent()->getName() << "\" from " 8220 << DebugLocStr << "\n"); 8221 8222 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 8223 8224 LLVM_DEBUG( 8225 dbgs() << "LV: Loop hints:" 8226 << " force=" 8227 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 8228 ? "disabled" 8229 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 8230 ? 
"enabled" 8231 : "?")) 8232 << " width=" << Hints.getWidth() 8233 << " unroll=" << Hints.getInterleave() << "\n"); 8234 8235 // Function containing loop 8236 Function *F = L->getHeader()->getParent(); 8237 8238 // Looking at the diagnostic output is the only way to determine if a loop 8239 // was vectorized (other than looking at the IR or machine code), so it 8240 // is important to generate an optimization remark for each loop. Most of 8241 // these messages are generated as OptimizationRemarkAnalysis. Remarks 8242 // generated as OptimizationRemark and OptimizationRemarkMissed are 8243 // less verbose reporting vectorized loops and unvectorized loops that may 8244 // benefit from vectorization, respectively. 8245 8246 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 8247 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 8248 return false; 8249 } 8250 8251 PredicatedScalarEvolution PSE(*SE, *L); 8252 8253 // Check if it is legal to vectorize the loop. 8254 LoopVectorizationRequirements Requirements(*ORE); 8255 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 8256 &Requirements, &Hints, DB, AC, BFI, PSI); 8257 if (!LVL.canVectorize(EnableVPlanNativePath)) { 8258 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 8259 Hints.emitRemarkWithHints(); 8260 return false; 8261 } 8262 8263 // Check the function attributes and profiles to find out if this function 8264 // should be optimized for size. 8265 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8266 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 8267 8268 // Entrance to the VPlan-native vectorization path. Outer loops are processed 8269 // here. They may require CFG and instruction level transformations before 8270 // even evaluating whether vectorization is profitable. Since we cannot modify 8271 // the incoming IR, we need to build VPlan upfront in the vectorization 8272 // pipeline. 8273 if (!L->empty()) 8274 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 8275 ORE, BFI, PSI, Hints); 8276 8277 assert(L->empty() && "Inner loop expected."); 8278 8279 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 8280 // count by optimizing for size, to minimize overheads. 8281 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 8282 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 8283 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 8284 << "This loop is worth vectorizing only if no scalar " 8285 << "iteration overheads are incurred."); 8286 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 8287 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 8288 else { 8289 LLVM_DEBUG(dbgs() << "\n"); 8290 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 8291 } 8292 } 8293 8294 // Check the function attributes to see if implicit floats are allowed. 8295 // FIXME: This check doesn't seem possibly correct -- what if the loop is 8296 // an integer loop and the vector instructions selected are purely integer 8297 // vector instructions? 8298 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 8299 reportVectorizationFailure( 8300 "Can't vectorize when the NoImplicitFloat attribute is used", 8301 "loop not vectorized due to NoImplicitFloat attribute", 8302 "NoImplicitFloat", ORE, L); 8303 Hints.emitRemarkWithHints(); 8304 return false; 8305 } 8306 8307 // Check if the target supports potentially unsafe FP vectorization. 

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor and interleave count.
  unsigned UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF =
      LVP.plan(ElementCount::getFixed(UserVF), UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }
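
  // For a concrete feel (illustrative numbers only; the real values come from
  // the cost model and the target): on a target with 128-bit vectors, a loop
  // over i32 elements would typically get VF.Width == 4, and with IC == 2 the
  // transformed loop processes 4 * 2 = 8 elements per iteration, leaving any
  // remaining iterations to the scalar epilogue when one is allowed.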

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM, BFI, PSI);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling of the scalar loop when there
    // are no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}
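
// A note on the remainder-loop metadata handled at the end of processLoop:
// users can steer what happens to the scalar remainder loop with followup
// metadata, e.g. (approximate IR sketch; see llvm/docs/TransformMetadata.rst
// for the exact form):
//
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.vectorize.followup_epilogue", !2}
//   !2 = !{!"llvm.loop.unroll.disable"}
//
// When such metadata is present, makeFollowupLoopID() builds the remainder
// loop's ID from it; otherwise the pass falls back to marking the loop as
// already vectorized and, if applicable, disabling runtime unrolling.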

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
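
// For reference, a typical way to exercise this pass in isolation with the
// new pass manager (illustrative command lines, not the only options):
//
//   opt -passes=loop-vectorize -S input.ll
//   opt -passes=loop-vectorize -pass-remarks=loop-vectorize \
//       -pass-remarks-analysis=loop-vectorize -S input.ll
//
// LoopVectorizePass::run() above is the entry point used in that pipeline.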