//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one (see the illustrative example at
// the end of this header comment).
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is an ongoing development effort to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
// Data for SIMD
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
// Vectorizing Compilers.
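//
// As a rough illustration (not the exact IR the pass emits), with VF = 4 a
// loop such as
//
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
//
// is conceptually rewritten so that each vector iteration processes four
// consecutive elements with <4 x i32> operations, followed by a scalar
// epilogue loop that finishes the remaining n % 4 iterations:
//
//   for (i = 0; i + 4 <= n; i += 4)
//     A[i..i+3] = B[i..i+3] + <42, 42, 42, 42>;   // widened body
//   for (; i < n; ++i)                            // scalar remainder
//     A[i] = B[i] + 42;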
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// The prefer-predicate-over-epilogue option indicates that an epilogue is
// undesired and that predication is preferred; this enum lists the options.
// I.e., the vectorizer will try to fold the tail-loop (epilogue) into the
// vector body and predicate the instructions accordingly. If tail-folding
// fails, there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
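/// For example (illustrative only), the loads of A[3*i] and A[3*i + 2] below
/// form an interleave group with factor 3 that has a gap (A[3*i + 1] is never
/// accessed); replacing the group with a single wide load per iteration then
/// requires masking off the gap lanes:
///
///   for (i = 0; i < N; ++i) {
///     Sum0 += A[3*i];
///     Sum2 += A[3*i + 2];
///   }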
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
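/// For example (illustrative, on a typical data layout), x86_fp80 is
/// irregular at VF = 1: the value itself occupies 80 bits, but each array
/// element is padded out to a 96- or 128-bit slot, so scalar accesses cannot
/// simply be reinterpreted as a packed vector access.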
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects.
/// The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for the given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
                ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive.
  /// Uses the VPValue operands from \p Operands instead of \p Instr's
  /// operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate
  /// a vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one.
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  void createInductionResumeValues(Loop *L, Value *VectorTripCount);

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag,
                             TheLoop, I)
            << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instructions may be vectorized in more than one way.
  /// The form of an instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
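  /// For example (illustrative), the address computation feeding a
  /// consecutive load is typically uniform after vectorization: only the
  /// first lane's address is needed to emit the single wide load.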
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W, unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(!VF.isScalable() && "scalable vectors not yet supported.");
    assert(VF.isVector() && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
1309 bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1310 return TTI.isLegalMaskedScatter(DataType, Alignment);
1311 }
1312
1313 /// Returns true if the target machine supports masked gather operation
1314 /// for the given \p DataType.
1315 bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1316 return TTI.isLegalMaskedGather(DataType, Alignment);
1317 }
1318
1319 /// Returns true if the target machine can represent \p V as a masked gather
1320 /// or scatter operation.
1321 bool isLegalGatherOrScatter(Value *V) {
1322 bool LI = isa<LoadInst>(V);
1323 bool SI = isa<StoreInst>(V);
1324 if (!LI && !SI)
1325 return false;
1326 auto *Ty = getMemInstValueType(V);
1327 Align Align = getLoadStoreAlignment(V);
1328 return (LI && isLegalMaskedGather(Ty, Align)) ||
1329 (SI && isLegalMaskedScatter(Ty, Align));
1330 }
1331
1332 /// Returns true if \p I is an instruction that will be scalarized with
1333 /// predication. Such instructions include conditional stores and
1334 /// instructions that may divide by zero.
1335 /// If a non-zero VF has been calculated, we check if \p I will be scalarized
1336 /// with predication for that VF.
1337 bool isScalarWithPredication(Instruction *I,
1338 ElementCount VF = ElementCount::getFixed(1));
1339
1340 // Returns true if \p I is an instruction that will be predicated either
1341 // through scalar predication or masked load/store or masked gather/scatter.
1342 // Superset of instructions that return true for isScalarWithPredication.
1343 bool isPredicatedInst(Instruction *I) {
1344 if (!blockNeedsPredication(I->getParent()))
1345 return false;
1346 // Loads and stores that need some form of masked operation are predicated
1347 // instructions.
1348 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1349 return Legal->isMaskRequired(I);
1350 return isScalarWithPredication(I);
1351 }
1352
1353 /// Returns true if \p I is a memory instruction with consecutive memory
1354 /// access that can be widened.
1355 bool
1356 memoryInstructionCanBeWidened(Instruction *I,
1357 ElementCount VF = ElementCount::getFixed(1));
1358
1359 /// Returns true if \p I is a memory instruction in an interleaved-group
1360 /// of memory accesses that can be vectorized with wide vector loads/stores
1361 /// and shuffles.
1362 bool
1363 interleavedAccessCanBeWidened(Instruction *I,
1364 ElementCount VF = ElementCount::getFixed(1));
1365
1366 /// Check if \p Instr belongs to any interleaved access group.
1367 bool isAccessInterleaved(Instruction *Instr) {
1368 return InterleaveInfo.isInterleaved(Instr);
1369 }
1370
1371 /// Get the interleaved access group that \p Instr belongs to.
1372 const InterleaveGroup<Instruction> *
1373 getInterleavedAccessGroup(Instruction *Instr) {
1374 return InterleaveInfo.getInterleaveGroup(Instr);
1375 }
1376
1377 /// Returns true if an interleaved group requires a scalar iteration
1378 /// to handle accesses with gaps, and there is nothing preventing us from
1379 /// creating a scalar epilogue.
1380 bool requiresScalarEpilogue() const {
1381 return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1382 }
1383
1384 /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1385 /// disallowed due to optsize or a loop hint annotation.
1386 bool isScalarEpilogueAllowed() const {
1387 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1388 }
1389
1390 /// Returns true if all loop blocks should be masked to fold the tail of the
1391 /// loop.
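/// When the tail is folded by masking, the vector loop also executes the
/// remainder iterations, with the left-over lanes disabled by the mask, so no
/// scalar epilogue loop is required.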
1391 bool foldTailByMasking() const { return FoldTailByMasking; } 1392 1393 bool blockNeedsPredication(BasicBlock *BB) { 1394 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1395 } 1396 1397 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1398 /// nodes to the chain of instructions representing the reductions. Uses a 1399 /// MapVector to ensure deterministic iteration order. 1400 using ReductionChainMap = 1401 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1402 1403 /// Return the chain of instructions representing an inloop reduction. 1404 const ReductionChainMap &getInLoopReductionChains() const { 1405 return InLoopReductionChains; 1406 } 1407 1408 /// Returns true if the Phi is part of an inloop reduction. 1409 bool isInLoopReduction(PHINode *Phi) const { 1410 return InLoopReductionChains.count(Phi); 1411 } 1412 1413 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1414 /// with factor VF. Return the cost of the instruction, including 1415 /// scalarization overhead if it's needed. 1416 unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1417 1418 /// Estimate cost of a call instruction CI if it were vectorized with factor 1419 /// VF. Return the cost of the instruction, including scalarization overhead 1420 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1421 /// scalarized - 1422 /// i.e. either vector version isn't available, or is too expensive. 1423 unsigned getVectorCallCost(CallInst *CI, ElementCount VF, 1424 bool &NeedToScalarize); 1425 1426 /// Invalidates decisions already taken by the cost model. 1427 void invalidateCostModelingDecisions() { 1428 WideningDecisions.clear(); 1429 Uniforms.clear(); 1430 Scalars.clear(); 1431 } 1432 1433 private: 1434 unsigned NumPredStores = 0; 1435 1436 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1437 /// than zero. One is returned if vectorization should best be avoided due 1438 /// to cost. 1439 unsigned computeFeasibleMaxVF(unsigned ConstTripCount); 1440 1441 /// The vectorization cost is a combination of the cost itself and a boolean 1442 /// indicating whether any of the contributing operations will actually 1443 /// operate on 1444 /// vector values after type legalization in the backend. If this latter value 1445 /// is 1446 /// false, then all operations will be scalarized (i.e. no vectorization has 1447 /// actually taken place). 1448 using VectorizationCostTy = std::pair<unsigned, bool>; 1449 1450 /// Returns the expected execution cost. The unit of the cost does 1451 /// not matter because we use the 'cost' units to compare different 1452 /// vector widths. The cost that is returned is *not* normalized by 1453 /// the factor width. 1454 VectorizationCostTy expectedCost(ElementCount VF); 1455 1456 /// Returns the execution time cost of an instruction for a given vector 1457 /// width. Vector width of one means scalar. 1458 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1459 1460 /// The cost-computation logic from getInstructionCost which provides 1461 /// the vector type as an output parameter. 1462 unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy); 1463 1464 /// Calculate vectorization cost of memory instruction \p I. 1465 unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF); 1466 1467 /// The cost computation for scalarized memory instruction. 
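/// This is roughly the cost of VF individual scalar accesses plus the overhead
/// of moving the individual lanes between vector and scalar values.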
1468 unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1469
1470 /// The cost computation for an interleaving group of memory instructions.
1471 unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1472
1473 /// The cost computation for a Gather/Scatter instruction.
1474 unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1475
1476 /// The cost computation for widening instruction \p I with consecutive
1477 /// memory access.
1478 unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1479
1480 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1481 /// Load: scalar load + broadcast.
1482 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1483 /// element)
1484 unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1485
1486 /// Estimate the overhead of scalarizing an instruction. This is a
1487 /// convenience wrapper for the type-based getScalarizationOverhead API.
1488 unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1489
1490 /// Returns whether the instruction is a load or store and will be emitted
1491 /// as a vector operation.
1492 bool isConsecutiveLoadOrStore(Instruction *I);
1493
1494 /// Returns true if an artificially high cost for emulated masked memrefs
1495 /// should be used.
1496 bool useEmulatedMaskMemRefHack(Instruction *I);
1497
1498 /// Map of scalar integer values to the smallest bitwidth they can be legally
1499 /// represented as. The vector equivalents of these values should be truncated
1500 /// to this type.
1501 MapVector<Instruction *, uint64_t> MinBWs;
1502
1503 /// A type representing the costs for instructions if they were to be
1504 /// scalarized rather than vectorized. The entries are Instruction-Cost
1505 /// pairs.
1506 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1507
1508 /// A set containing all BasicBlocks that are known to be present after
1509 /// vectorization as predicated blocks.
1510 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1511
1512 /// Records whether it is allowed to have the original scalar loop execute at
1513 /// least once. This may be needed as a fallback loop in case runtime
1514 /// aliasing/dependence checks fail, or to handle the tail/remainder
1515 /// iterations when the trip count is unknown or is not divisible by the VF,
1516 /// or as a peel-loop to handle gaps in interleave-groups.
1517 /// Under optsize and when the trip count is very small we don't allow any
1518 /// iterations to execute in the scalar loop.
1519 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1520
1521 /// All blocks of the loop are to be masked to fold the tail of the scalar
1522 /// iterations.
1523 bool FoldTailByMasking = false;
1524
1525 /// A map holding scalar costs for different vectorization factors. The
1526 /// presence of a cost for an instruction in the mapping indicates that the
1527 /// instruction will be scalarized when vectorizing with the associated
1528 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1529 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1530
1531 /// Holds the instructions known to be uniform after vectorization.
1532 /// The data is collected per VF.
1533 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1534
1535 /// Holds the instructions known to be scalar after vectorization.
/// The data is collected per VF.
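/// (A scalar instruction is replicated once per lane and unroll part, unless
/// it is also uniform, in which case only lane zero is generated.)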
1536 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1537 1538 /// Holds the instructions (address computations) that are forced to be 1539 /// scalarized. 1540 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1541 1542 /// PHINodes of the reductions that should be expanded in-loop along with 1543 /// their associated chains of reduction operations, in program order from top 1544 /// (PHI) to bottom 1545 ReductionChainMap InLoopReductionChains; 1546 1547 /// Returns the expected difference in cost from scalarizing the expression 1548 /// feeding a predicated instruction \p PredInst. The instructions to 1549 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1550 /// non-negative return value implies the expression will be scalarized. 1551 /// Currently, only single-use chains are considered for scalarization. 1552 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1553 ElementCount VF); 1554 1555 /// Collect the instructions that are uniform after vectorization. An 1556 /// instruction is uniform if we represent it with a single scalar value in 1557 /// the vectorized loop corresponding to each vector iteration. Examples of 1558 /// uniform instructions include pointer operands of consecutive or 1559 /// interleaved memory accesses. Note that although uniformity implies an 1560 /// instruction will be scalar, the reverse is not true. In general, a 1561 /// scalarized instruction will be represented by VF scalar values in the 1562 /// vectorized loop, each corresponding to an iteration of the original 1563 /// scalar loop. 1564 void collectLoopUniforms(ElementCount VF); 1565 1566 /// Collect the instructions that are scalar after vectorization. An 1567 /// instruction is scalar if it is known to be uniform or will be scalarized 1568 /// during vectorization. Non-uniform scalarized instructions will be 1569 /// represented by VF values in the vectorized loop, each corresponding to an 1570 /// iteration of the original scalar loop. 1571 void collectLoopScalars(ElementCount VF); 1572 1573 /// Keeps cost model vectorization decision and cost for instructions. 1574 /// Right now it is used for memory instructions only. 1575 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1576 std::pair<InstWidening, unsigned>>; 1577 1578 DecisionList WideningDecisions; 1579 1580 /// Returns true if \p V is expected to be vectorized and it needs to be 1581 /// extracted. 1582 bool needsExtract(Value *V, ElementCount VF) const { 1583 Instruction *I = dyn_cast<Instruction>(V); 1584 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1585 TheLoop->isLoopInvariant(I)) 1586 return false; 1587 1588 // Assume we can vectorize V (and hence we need extraction) if the 1589 // scalars are not computed yet. This can happen, because it is called 1590 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1591 // the scalars are collected. That should be a safe assumption in most 1592 // cases, because we check if the operands have vectorizable types 1593 // beforehand in LoopVectorizationLegality. 1594 return Scalars.find(VF) == Scalars.end() || 1595 !isScalarAfterVectorization(I, VF); 1596 }; 1597 1598 /// Returns a range containing only operands needing to be extracted. 
1599 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1600 ElementCount VF) { 1601 return SmallVector<Value *, 4>(make_filter_range( 1602 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1603 } 1604 1605 public: 1606 /// The loop that we evaluate. 1607 Loop *TheLoop; 1608 1609 /// Predicated scalar evolution analysis. 1610 PredicatedScalarEvolution &PSE; 1611 1612 /// Loop Info analysis. 1613 LoopInfo *LI; 1614 1615 /// Vectorization legality. 1616 LoopVectorizationLegality *Legal; 1617 1618 /// Vector target information. 1619 const TargetTransformInfo &TTI; 1620 1621 /// Target Library Info. 1622 const TargetLibraryInfo *TLI; 1623 1624 /// Demanded bits analysis. 1625 DemandedBits *DB; 1626 1627 /// Assumption cache. 1628 AssumptionCache *AC; 1629 1630 /// Interface to emit optimization remarks. 1631 OptimizationRemarkEmitter *ORE; 1632 1633 const Function *TheFunction; 1634 1635 /// Loop Vectorize Hint. 1636 const LoopVectorizeHints *Hints; 1637 1638 /// The interleave access information contains groups of interleaved accesses 1639 /// with the same stride and close to each other. 1640 InterleavedAccessInfo &InterleaveInfo; 1641 1642 /// Values to ignore in the cost model. 1643 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1644 1645 /// Values to ignore in the cost model when VF > 1. 1646 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1647 }; 1648 1649 } // end namespace llvm 1650 1651 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1652 // vectorization. The loop needs to be annotated with #pragma omp simd 1653 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1654 // vector length information is not provided, vectorization is not considered 1655 // explicit. Interleave hints are not allowed either. These limitations will be 1656 // relaxed in the future. 1657 // Please, note that we are currently forced to abuse the pragma 'clang 1658 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1659 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1660 // provides *explicit vectorization hints* (LV can bypass legal checks and 1661 // assume that vectorization is legal). However, both hints are implemented 1662 // using the same metadata (llvm.loop.vectorize, processed by 1663 // LoopVectorizeHints). This will be fixed in the future when the native IR 1664 // representation for pragma 'omp simd' is introduced. 1665 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1666 OptimizationRemarkEmitter *ORE) { 1667 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 1668 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1669 1670 // Only outer loops with an explicit vectorization hint are supported. 1671 // Unannotated outer loops are ignored. 1672 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1673 return false; 1674 1675 Function *Fn = OuterLp->getHeader()->getParent(); 1676 if (!Hints.allowVectorization(Fn, OuterLp, 1677 true /*VectorizeOnlyWhenForced*/)) { 1678 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1679 return false; 1680 } 1681 1682 if (Hints.getInterleave() > 1) { 1683 // TODO: Interleave support is future work. 
1684 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1685 "outer loops.\n"); 1686 Hints.emitRemarkWithHints(); 1687 return false; 1688 } 1689 1690 return true; 1691 } 1692 1693 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1694 OptimizationRemarkEmitter *ORE, 1695 SmallVectorImpl<Loop *> &V) { 1696 // Collect inner loops and outer loops without irreducible control flow. For 1697 // now, only collect outer loops that have explicit vectorization hints. If we 1698 // are stress testing the VPlan H-CFG construction, we collect the outermost 1699 // loop of every loop nest. 1700 if (L.isInnermost() || VPlanBuildStressTest || 1701 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1702 LoopBlocksRPO RPOT(&L); 1703 RPOT.perform(LI); 1704 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1705 V.push_back(&L); 1706 // TODO: Collect inner loops inside marked outer loops in case 1707 // vectorization fails for the outer loop. Do not invoke 1708 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1709 // already known to be reducible. We can use an inherited attribute for 1710 // that. 1711 return; 1712 } 1713 } 1714 for (Loop *InnerL : L) 1715 collectSupportedLoops(*InnerL, LI, ORE, V); 1716 } 1717 1718 namespace { 1719 1720 /// The LoopVectorize Pass. 1721 struct LoopVectorize : public FunctionPass { 1722 /// Pass identification, replacement for typeid 1723 static char ID; 1724 1725 LoopVectorizePass Impl; 1726 1727 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1728 bool VectorizeOnlyWhenForced = false) 1729 : FunctionPass(ID), 1730 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1731 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1732 } 1733 1734 bool runOnFunction(Function &F) override { 1735 if (skipFunction(F)) 1736 return false; 1737 1738 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1739 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1740 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1741 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1742 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1743 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1744 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 1745 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1746 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1747 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1748 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1749 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1750 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1751 1752 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1753 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1754 1755 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1756 GetLAA, *ORE, PSI).MadeAnyChange; 1757 } 1758 1759 void getAnalysisUsage(AnalysisUsage &AU) const override { 1760 AU.addRequired<AssumptionCacheTracker>(); 1761 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1762 AU.addRequired<DominatorTreeWrapperPass>(); 1763 AU.addRequired<LoopInfoWrapperPass>(); 1764 AU.addRequired<ScalarEvolutionWrapperPass>(); 1765 AU.addRequired<TargetTransformInfoWrapperPass>(); 1766 AU.addRequired<AAResultsWrapperPass>(); 1767 AU.addRequired<LoopAccessLegacyAnalysis>(); 1768 AU.addRequired<DemandedBitsWrapperPass>(); 1769 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1770 AU.addRequired<InjectTLIMappingsLegacy>(); 1771 1772 // We currently do not preserve loopinfo/dominator analyses with outer loop 1773 // vectorization. Until this is addressed, mark these analyses as preserved 1774 // only for non-VPlan-native path. 1775 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1776 if (!EnableVPlanNativePath) { 1777 AU.addPreserved<LoopInfoWrapperPass>(); 1778 AU.addPreserved<DominatorTreeWrapperPass>(); 1779 } 1780 1781 AU.addPreserved<BasicAAWrapperPass>(); 1782 AU.addPreserved<GlobalsAAWrapperPass>(); 1783 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1784 } 1785 }; 1786 1787 } // end anonymous namespace 1788 1789 //===----------------------------------------------------------------------===// 1790 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1791 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1792 //===----------------------------------------------------------------------===// 1793 1794 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1795 // We need to place the broadcast of invariant variables outside the loop, 1796 // but only if it's proven safe to do so. Else, broadcast will be inside 1797 // vector loop body. 1798 Instruction *Instr = dyn_cast<Instruction>(V); 1799 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1800 (!Instr || 1801 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1802 // Place the code for broadcasting invariant variables in the new preheader. 1803 IRBuilder<>::InsertPointGuard Guard(Builder); 1804 if (SafeToHoist) 1805 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1806 1807 // Broadcast the scalar into all locations in the vector. 
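// CreateVectorSplat emits an insertelement of V into lane zero followed by a
// shufflevector with an all-zero mask that copies that lane into every other
// lane (unless the splat can be constant-folded).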
1808 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1809 1810 return Shuf; 1811 } 1812 1813 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1814 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1815 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1816 "Expected either an induction phi-node or a truncate of it!"); 1817 Value *Start = II.getStartValue(); 1818 1819 // Construct the initial value of the vector IV in the vector loop preheader 1820 auto CurrIP = Builder.saveIP(); 1821 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1822 if (isa<TruncInst>(EntryVal)) { 1823 assert(Start->getType()->isIntegerTy() && 1824 "Truncation requires an integer type"); 1825 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1826 Step = Builder.CreateTrunc(Step, TruncType); 1827 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1828 } 1829 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1830 Value *SteppedStart = 1831 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1832 1833 // We create vector phi nodes for both integer and floating-point induction 1834 // variables. Here, we determine the kind of arithmetic we will perform. 1835 Instruction::BinaryOps AddOp; 1836 Instruction::BinaryOps MulOp; 1837 if (Step->getType()->isIntegerTy()) { 1838 AddOp = Instruction::Add; 1839 MulOp = Instruction::Mul; 1840 } else { 1841 AddOp = II.getInductionOpcode(); 1842 MulOp = Instruction::FMul; 1843 } 1844 1845 // Multiply the vectorization factor by the step using integer or 1846 // floating-point arithmetic as appropriate. 1847 Value *ConstVF = 1848 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 1849 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1850 1851 // Create a vector splat to use in the induction update. 1852 // 1853 // FIXME: If the step is non-constant, we create the vector splat with 1854 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1855 // handle a constant vector splat. 1856 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1857 Value *SplatVF = isa<Constant>(Mul) 1858 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 1859 : Builder.CreateVectorSplat(VF, Mul); 1860 Builder.restoreIP(CurrIP); 1861 1862 // We may need to add the step a number of times, depending on the unroll 1863 // factor. The last of those goes into the PHI. 1864 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1865 &*LoopVectorBody->getFirstInsertionPt()); 1866 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1867 Instruction *LastInduction = VecInd; 1868 for (unsigned Part = 0; Part < UF; ++Part) { 1869 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1870 1871 if (isa<TruncInst>(EntryVal)) 1872 addMetadata(LastInduction, EntryVal); 1873 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1874 1875 LastInduction = cast<Instruction>(addFastMathFlag( 1876 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1877 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1878 } 1879 1880 // Move the last step to the end of the latch block. This ensures consistent 1881 // placement of all induction updates. 
1882 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1883 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1884 auto *ICmp = cast<Instruction>(Br->getCondition()); 1885 LastInduction->moveBefore(ICmp); 1886 LastInduction->setName("vec.ind.next"); 1887 1888 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1889 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1890 } 1891 1892 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1893 return Cost->isScalarAfterVectorization(I, VF) || 1894 Cost->isProfitableToScalarize(I, VF); 1895 } 1896 1897 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1898 if (shouldScalarizeInstruction(IV)) 1899 return true; 1900 auto isScalarInst = [&](User *U) -> bool { 1901 auto *I = cast<Instruction>(U); 1902 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1903 }; 1904 return llvm::any_of(IV->users(), isScalarInst); 1905 } 1906 1907 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1908 const InductionDescriptor &ID, const Instruction *EntryVal, 1909 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1910 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1911 "Expected either an induction phi-node or a truncate of it!"); 1912 1913 // This induction variable is not the phi from the original loop but the 1914 // newly-created IV based on the proof that casted Phi is equal to the 1915 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1916 // re-uses the same InductionDescriptor that original IV uses but we don't 1917 // have to do any recording in this case - that is done when original IV is 1918 // processed. 1919 if (isa<TruncInst>(EntryVal)) 1920 return; 1921 1922 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1923 if (Casts.empty()) 1924 return; 1925 // Only the first Cast instruction in the Casts vector is of interest. 1926 // The rest of the Casts (if exist) have no uses outside the 1927 // induction update chain itself. 1928 Instruction *CastInst = *Casts.begin(); 1929 if (Lane < UINT_MAX) 1930 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1931 else 1932 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1933 } 1934 1935 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1936 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1937 "Primary induction variable must have an integer type"); 1938 1939 auto II = Legal->getInductionVars().find(IV); 1940 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1941 1942 auto ID = II->second; 1943 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1944 1945 // The value from the original loop to which we are mapping the new induction 1946 // variable. 1947 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1948 1949 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1950 1951 // Generate code for the induction step. 
Note that induction steps are 1952 // required to be loop-invariant 1953 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1954 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1955 "Induction step should be loop invariant"); 1956 if (PSE.getSE()->isSCEVable(IV->getType())) { 1957 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1958 return Exp.expandCodeFor(Step, Step->getType(), 1959 LoopVectorPreHeader->getTerminator()); 1960 } 1961 return cast<SCEVUnknown>(Step)->getValue(); 1962 }; 1963 1964 // The scalar value to broadcast. This is derived from the canonical 1965 // induction variable. If a truncation type is given, truncate the canonical 1966 // induction variable and step. Otherwise, derive these values from the 1967 // induction descriptor. 1968 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1969 Value *ScalarIV = Induction; 1970 if (IV != OldInduction) { 1971 ScalarIV = IV->getType()->isIntegerTy() 1972 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1973 : Builder.CreateCast(Instruction::SIToFP, Induction, 1974 IV->getType()); 1975 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1976 ScalarIV->setName("offset.idx"); 1977 } 1978 if (Trunc) { 1979 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1980 assert(Step->getType()->isIntegerTy() && 1981 "Truncation requires an integer step"); 1982 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1983 Step = Builder.CreateTrunc(Step, TruncType); 1984 } 1985 return ScalarIV; 1986 }; 1987 1988 // Create the vector values from the scalar IV, in the absence of creating a 1989 // vector IV. 1990 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1991 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1992 for (unsigned Part = 0; Part < UF; ++Part) { 1993 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1994 Value *EntryPart = 1995 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 1996 ID.getInductionOpcode()); 1997 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1998 if (Trunc) 1999 addMetadata(EntryPart, Trunc); 2000 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 2001 } 2002 }; 2003 2004 // Now do the actual transformations, and start with creating the step value. 2005 Value *Step = CreateStepValue(ID.getStep()); 2006 if (VF.isZero() || VF.isScalar()) { 2007 Value *ScalarIV = CreateScalarIV(Step); 2008 CreateSplatIV(ScalarIV, Step); 2009 return; 2010 } 2011 2012 // Determine if we want a scalar version of the induction variable. This is 2013 // true if the induction variable itself is not widened, or if it has at 2014 // least one user in the loop that is not widened. 2015 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2016 if (!NeedsScalarIV) { 2017 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2018 return; 2019 } 2020 2021 // Try to create a new independent vector induction variable. If we can't 2022 // create the phi node, we will splat the scalar induction variable in each 2023 // loop iteration. 2024 if (!shouldScalarizeInstruction(EntryVal)) { 2025 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2026 Value *ScalarIV = CreateScalarIV(Step); 2027 // Create scalar steps that can be used by instructions we will later 2028 // scalarize. Note that the addition of the scalar steps will not increase 2029 // the number of instructions in the loop in the common case prior to 2030 // InstCombine. We will be trading one vector extract for each scalar step. 
2031 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2032 return; 2033 } 2034 2035 // All IV users are scalar instructions, so only emit a scalar IV, not a 2036 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2037 // predicate used by the masked loads/stores. 2038 Value *ScalarIV = CreateScalarIV(Step); 2039 if (!Cost->isScalarEpilogueAllowed()) 2040 CreateSplatIV(ScalarIV, Step); 2041 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2042 } 2043 2044 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2045 Instruction::BinaryOps BinOp) { 2046 // Create and check the types. 2047 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2048 int VLen = ValVTy->getNumElements(); 2049 2050 Type *STy = Val->getType()->getScalarType(); 2051 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2052 "Induction Step must be an integer or FP"); 2053 assert(Step->getType() == STy && "Step has wrong type"); 2054 2055 SmallVector<Constant *, 8> Indices; 2056 2057 if (STy->isIntegerTy()) { 2058 // Create a vector of consecutive numbers from zero to VF. 2059 for (int i = 0; i < VLen; ++i) 2060 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2061 2062 // Add the consecutive indices to the vector value. 2063 Constant *Cv = ConstantVector::get(Indices); 2064 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2065 Step = Builder.CreateVectorSplat(VLen, Step); 2066 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2067 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2068 // which can be found from the original scalar operations. 2069 Step = Builder.CreateMul(Cv, Step); 2070 return Builder.CreateAdd(Val, Step, "induction"); 2071 } 2072 2073 // Floating point induction. 2074 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2075 "Binary Opcode should be specified for FP induction"); 2076 // Create a vector of consecutive numbers from zero to VF. 2077 for (int i = 0; i < VLen; ++i) 2078 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2079 2080 // Add the consecutive indices to the vector value. 2081 Constant *Cv = ConstantVector::get(Indices); 2082 2083 Step = Builder.CreateVectorSplat(VLen, Step); 2084 2085 // Floating point operations had to be 'fast' to enable the induction. 2086 FastMathFlags Flags; 2087 Flags.setFast(); 2088 2089 Value *MulOp = Builder.CreateFMul(Cv, Step); 2090 if (isa<Instruction>(MulOp)) 2091 // Have to check, MulOp may be a constant 2092 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2093 2094 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2095 if (isa<Instruction>(BOp)) 2096 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2097 return BOp; 2098 } 2099 2100 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2101 Instruction *EntryVal, 2102 const InductionDescriptor &ID) { 2103 // We shouldn't have to build scalar steps if we aren't vectorizing. 2104 assert(VF.isVector() && "VF should be greater than one"); 2105 assert(!VF.isScalable() && 2106 "the code below assumes a fixed number of elements at compile time"); 2107 // Get the value type and ensure it and the step have the same integer type. 2108 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2109 assert(ScalarIVTy == Step->getType() && 2110 "Val and Step should have the same type"); 2111 2112 // We build scalar steps for both integer and floating-point induction 2113 // variables. 
Here, we determine the kind of arithmetic we will perform. 2114 Instruction::BinaryOps AddOp; 2115 Instruction::BinaryOps MulOp; 2116 if (ScalarIVTy->isIntegerTy()) { 2117 AddOp = Instruction::Add; 2118 MulOp = Instruction::Mul; 2119 } else { 2120 AddOp = ID.getInductionOpcode(); 2121 MulOp = Instruction::FMul; 2122 } 2123 2124 // Determine the number of scalars we need to generate for each unroll 2125 // iteration. If EntryVal is uniform, we only need to generate the first 2126 // lane. Otherwise, we generate all VF values. 2127 unsigned Lanes = 2128 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2129 ? 1 2130 : VF.getKnownMinValue(); 2131 // Compute the scalar steps and save the results in VectorLoopValueMap. 2132 for (unsigned Part = 0; Part < UF; ++Part) { 2133 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2134 auto *StartIdx = getSignedIntOrFpConstant( 2135 ScalarIVTy, VF.getKnownMinValue() * Part + Lane); 2136 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2137 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2138 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2139 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2140 } 2141 } 2142 } 2143 2144 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2145 assert(V != Induction && "The new induction variable should not be used."); 2146 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2147 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2148 2149 // If we have a stride that is replaced by one, do it here. Defer this for 2150 // the VPlan-native path until we start running Legal checks in that path. 2151 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2152 V = ConstantInt::get(V->getType(), 1); 2153 2154 // If we have a vector mapped to this value, return it. 2155 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2156 return VectorLoopValueMap.getVectorValue(V, Part); 2157 2158 // If the value has not been vectorized, check if it has been scalarized 2159 // instead. If it has been scalarized, and we actually need the value in 2160 // vector form, we will construct the vector values on demand. 2161 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2162 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2163 2164 // If we've scalarized a value, that value should be an instruction. 2165 auto *I = cast<Instruction>(V); 2166 2167 // If we aren't vectorizing, we can just copy the scalar map values over to 2168 // the vector map. 2169 if (VF == 1) { 2170 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2171 return ScalarValue; 2172 } 2173 2174 // Get the last scalar instruction we generated for V and Part. If the value 2175 // is known to be uniform after vectorization, this corresponds to lane zero 2176 // of the Part unroll iteration. Otherwise, the last instruction is the one 2177 // we created for the last vector lane of the Part unroll iteration. 2178 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2179 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2180 ? 0 2181 : VF.getKnownMinValue() - 1; 2182 auto *LastInst = cast<Instruction>( 2183 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2184 2185 // Set the insert point after the last scalarized instruction. This ensures 2186 // the insertelement sequence will directly follow the scalar definitions. 
2187 auto OldIP = Builder.saveIP(); 2188 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2189 Builder.SetInsertPoint(&*NewIP); 2190 2191 // However, if we are vectorizing, we need to construct the vector values. 2192 // If the value is known to be uniform after vectorization, we can just 2193 // broadcast the scalar value corresponding to lane zero for each unroll 2194 // iteration. Otherwise, we construct the vector values using insertelement 2195 // instructions. Since the resulting vectors are stored in 2196 // VectorLoopValueMap, we will only generate the insertelements once. 2197 Value *VectorValue = nullptr; 2198 if (Cost->isUniformAfterVectorization(I, VF)) { 2199 VectorValue = getBroadcastInstrs(ScalarValue); 2200 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2201 } else { 2202 // Initialize packing with insertelements to start from undef. 2203 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2204 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2205 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2206 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2207 packScalarIntoVectorValue(V, {Part, Lane}); 2208 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2209 } 2210 Builder.restoreIP(OldIP); 2211 return VectorValue; 2212 } 2213 2214 // If this scalar is unknown, assume that it is a constant or that it is 2215 // loop invariant. Broadcast V and save the value for future uses. 2216 Value *B = getBroadcastInstrs(V); 2217 VectorLoopValueMap.setVectorValue(V, Part, B); 2218 return B; 2219 } 2220 2221 Value * 2222 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2223 const VPIteration &Instance) { 2224 // If the value is not an instruction contained in the loop, it should 2225 // already be scalar. 2226 if (OrigLoop->isLoopInvariant(V)) 2227 return V; 2228 2229 assert(Instance.Lane > 0 2230 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2231 : true && "Uniform values only have lane zero"); 2232 2233 // If the value from the original loop has not been vectorized, it is 2234 // represented by UF x VF scalar values in the new loop. Return the requested 2235 // scalar value. 2236 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2237 return VectorLoopValueMap.getScalarValue(V, Instance); 2238 2239 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2240 // for the given unroll part. If this entry is not a vector type (i.e., the 2241 // vectorization factor is one), there is no need to generate an 2242 // extractelement instruction. 2243 auto *U = getOrCreateVectorValue(V, Instance.Part); 2244 if (!U->getType()->isVectorTy()) { 2245 assert(VF == 1 && "Value not scalarized has non-vector type"); 2246 return U; 2247 } 2248 2249 // Otherwise, the value from the original loop has been vectorized and is 2250 // represented by UF vector values. Extract and return the requested scalar 2251 // value from the appropriate vector lane. 
2252 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2253 } 2254 2255 void InnerLoopVectorizer::packScalarIntoVectorValue( 2256 Value *V, const VPIteration &Instance) { 2257 assert(V != Induction && "The new induction variable should not be used."); 2258 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2259 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2260 2261 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2262 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2263 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2264 Builder.getInt32(Instance.Lane)); 2265 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2266 } 2267 2268 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2269 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2270 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2271 SmallVector<int, 8> ShuffleMask; 2272 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2273 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2274 2275 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); 2276 } 2277 2278 // Return whether we allow using masked interleave-groups (for dealing with 2279 // strided loads/stores that reside in predicated blocks, or for dealing 2280 // with gaps). 2281 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2282 // If an override option has been passed in for interleaved accesses, use it. 2283 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2284 return EnableMaskedInterleavedMemAccesses; 2285 2286 return TTI.enableMaskedInterleavedAccessVectorization(); 2287 } 2288 2289 // Try to vectorize the interleave group that \p Instr belongs to. 2290 // 2291 // E.g. Translate following interleaved load group (factor = 3): 2292 // for (i = 0; i < N; i+=3) { 2293 // R = Pic[i]; // Member of index 0 2294 // G = Pic[i+1]; // Member of index 1 2295 // B = Pic[i+2]; // Member of index 2 2296 // ... // do something to R, G, B 2297 // } 2298 // To: 2299 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2300 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2301 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2302 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2303 // 2304 // Or translate following interleaved store group (factor = 3): 2305 // for (i = 0; i < N; i+=3) { 2306 // ... do something to R, G, B 2307 // Pic[i] = R; // Member of index 0 2308 // Pic[i+1] = G; // Member of index 1 2309 // Pic[i+2] = B; // Member of index 2 2310 // } 2311 // To: 2312 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2313 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2314 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2315 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2316 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2317 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2318 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2319 VPValue *Addr, VPValue *BlockInMask) { 2320 Instruction *Instr = Group->getInsertPos(); 2321 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2322 2323 // Prepare for the vector type of the interleaved load/store. 
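// For example, for an i32 group with factor 3 and VF = 4 (as in the example
// above), the wide vector type computed below is <12 x i32>.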
2324 Type *ScalarTy = getMemInstValueType(Instr);
2325 unsigned InterleaveFactor = Group->getFactor();
2326 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2327 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2328
2329 // Prepare for the new pointers.
2330 SmallVector<Value *, 2> AddrParts;
2331 unsigned Index = Group->getIndex(Instr);
2332
2333 // TODO: extend the masked interleaved-group support to reversed access.
2334 assert((!BlockInMask || !Group->isReverse()) &&
2335 "Reversed masked interleave-group not supported.");
2336
2337 // If the group is reverse, adjust the index to refer to the last vector lane
2338 // instead of the first. We adjust the index from the first vector lane,
2339 // rather than directly getting the pointer for lane VF - 1, because the
2340 // pointer operand of the interleaved access is supposed to be uniform. For
2341 // uniform instructions, we're only required to generate a value for the
2342 // first vector lane in each unroll iteration.
2343 assert(!VF.isScalable() &&
2344 "scalable vector reverse operation is not implemented");
2345 if (Group->isReverse())
2346 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2347
2348 for (unsigned Part = 0; Part < UF; Part++) {
2349 Value *AddrPart = State.get(Addr, {Part, 0});
2350 setDebugLocFromInst(Builder, AddrPart);
2351
2352 // Note that the current instruction could be at any index in the group. We
2353 // need to adjust the address to that of the member at index 0.
2354 //
2355 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2356 // b = A[i]; // Member of index 0
2357 // The current pointer points to A[i+1]; adjust it to A[i].
2358 //
2359 // E.g. A[i+1] = a; // Member of index 1
2360 // A[i] = b; // Member of index 0
2361 // A[i+2] = c; // Member of index 2 (Current instruction)
2362 // The current pointer points to A[i+2]; adjust it to A[i].
2363
2364 bool InBounds = false;
2365 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2366 InBounds = gep->isInBounds();
2367 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2368 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2369
2370 // Cast to the vector pointer type.
2371 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2372 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2373 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2374 }
2375
2376 setDebugLocFromInst(Builder, Instr);
2377 Value *UndefVec = UndefValue::get(VecTy);
2378
2379 Value *MaskForGaps = nullptr;
2380 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2381 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2382 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2383 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2384 }
2385
2386 // Vectorize the interleaved load group.
2387 if (isa<LoadInst>(Instr)) {
2388 // For each unroll part, create a wide load for the group.
2389 SmallVector<Value *, 2> NewLoads; 2390 for (unsigned Part = 0; Part < UF; Part++) { 2391 Instruction *NewLoad; 2392 if (BlockInMask || MaskForGaps) { 2393 assert(useMaskedInterleavedAccesses(*TTI) && 2394 "masked interleaved groups are not allowed."); 2395 Value *GroupMask = MaskForGaps; 2396 if (BlockInMask) { 2397 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2398 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2399 Value *ShuffledMask = Builder.CreateShuffleVector( 2400 BlockInMaskPart, 2401 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2402 "interleaved.mask"); 2403 GroupMask = MaskForGaps 2404 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2405 MaskForGaps) 2406 : ShuffledMask; 2407 } 2408 NewLoad = 2409 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2410 GroupMask, UndefVec, "wide.masked.vec"); 2411 } 2412 else 2413 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2414 Group->getAlign(), "wide.vec"); 2415 Group->addMetadata(NewLoad); 2416 NewLoads.push_back(NewLoad); 2417 } 2418 2419 // For each member in the group, shuffle out the appropriate data from the 2420 // wide loads. 2421 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2422 Instruction *Member = Group->getMember(I); 2423 2424 // Skip the gaps in the group. 2425 if (!Member) 2426 continue; 2427 2428 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2429 auto StrideMask = 2430 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2431 for (unsigned Part = 0; Part < UF; Part++) { 2432 Value *StridedVec = Builder.CreateShuffleVector( 2433 NewLoads[Part], StrideMask, "strided.vec"); 2434 2435 // If this member has different type, cast the result type. 2436 if (Member->getType() != ScalarTy) { 2437 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2438 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2439 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2440 } 2441 2442 if (Group->isReverse()) 2443 StridedVec = reverseVector(StridedVec); 2444 2445 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2446 } 2447 } 2448 return; 2449 } 2450 2451 // The sub vector type for current instruction. 2452 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2453 auto *SubVT = VectorType::get(ScalarTy, VF); 2454 2455 // Vectorize the interleaved store group. 2456 for (unsigned Part = 0; Part < UF; Part++) { 2457 // Collect the stored vector from each member. 2458 SmallVector<Value *, 4> StoredVecs; 2459 for (unsigned i = 0; i < InterleaveFactor; i++) { 2460 // Interleaved store group doesn't allow a gap, so each index has a member 2461 Instruction *Member = Group->getMember(i); 2462 assert(Member && "Fail to get a member from an interleaved store group"); 2463 2464 Value *StoredVec = getOrCreateVectorValue( 2465 cast<StoreInst>(Member)->getValueOperand(), Part); 2466 if (Group->isReverse()) 2467 StoredVec = reverseVector(StoredVec); 2468 2469 // If this member has different type, cast it to a unified type. 2470 2471 if (StoredVec->getType() != SubVT) 2472 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2473 2474 StoredVecs.push_back(StoredVec); 2475 } 2476 2477 // Concatenate all vectors into a wide vector. 2478 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2479 2480 // Interleave the elements in the wide vector. 
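// E.g. for factor 3 and VF = 4 the interleave mask is
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>, matching the store example above.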
2481 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2482 Value *IVec = Builder.CreateShuffleVector( 2483 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2484 "interleaved.vec"); 2485 2486 Instruction *NewStoreInstr; 2487 if (BlockInMask) { 2488 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2489 Value *ShuffledMask = Builder.CreateShuffleVector( 2490 BlockInMaskPart, 2491 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2492 "interleaved.mask"); 2493 NewStoreInstr = Builder.CreateMaskedStore( 2494 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2495 } 2496 else 2497 NewStoreInstr = 2498 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2499 2500 Group->addMetadata(NewStoreInstr); 2501 } 2502 } 2503 2504 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2505 VPTransformState &State, 2506 VPValue *Addr, 2507 VPValue *StoredValue, 2508 VPValue *BlockInMask) { 2509 // Attempt to issue a wide load. 2510 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2511 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2512 2513 assert((LI || SI) && "Invalid Load/Store instruction"); 2514 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2515 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2516 2517 LoopVectorizationCostModel::InstWidening Decision = 2518 Cost->getWideningDecision(Instr, VF); 2519 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2520 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2521 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2522 "CM decision is not to widen the memory instruction"); 2523 2524 Type *ScalarDataTy = getMemInstValueType(Instr); 2525 2526 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2527 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2528 const Align Alignment = getLoadStoreAlignment(Instr); 2529 2530 // Determine if the pointer operand of the access is either consecutive or 2531 // reverse consecutive. 2532 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2533 bool ConsecutiveStride = 2534 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2535 bool CreateGatherScatter = 2536 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2537 2538 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2539 // gather/scatter. Otherwise Decision should have been to Scalarize. 2540 assert((ConsecutiveStride || CreateGatherScatter) && 2541 "The instruction should be scalarized"); 2542 (void)ConsecutiveStride; 2543 2544 VectorParts BlockInMaskParts(UF); 2545 bool isMaskRequired = BlockInMask; 2546 if (isMaskRequired) 2547 for (unsigned Part = 0; Part < UF; ++Part) 2548 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2549 2550 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2551 // Calculate the pointer for the specific unroll-part. 2552 GetElementPtrInst *PartPtr = nullptr; 2553 2554 bool InBounds = false; 2555 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2556 InBounds = gep->isInBounds(); 2557 2558 if (Reverse) { 2559 // If the address is consecutive but reversed, then the 2560 // wide store needs to start at the last vector element. 
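// E.g. for VF = 4 and Part = 1, the two GEPs below compute Ptr - 4 and then
// Ptr - 7, i.e. the lowest address accessed by that (reversed) part.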
2561 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2562 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2563 PartPtr->setIsInBounds(InBounds); 2564 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2565 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2566 PartPtr->setIsInBounds(InBounds); 2567 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2568 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2569 } else { 2570 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2571 ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue()))); 2572 PartPtr->setIsInBounds(InBounds); 2573 } 2574 2575 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2576 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2577 }; 2578 2579 // Handle Stores: 2580 if (SI) { 2581 setDebugLocFromInst(Builder, SI); 2582 2583 for (unsigned Part = 0; Part < UF; ++Part) { 2584 Instruction *NewSI = nullptr; 2585 Value *StoredVal = State.get(StoredValue, Part); 2586 if (CreateGatherScatter) { 2587 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2588 Value *VectorGep = State.get(Addr, Part); 2589 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2590 MaskPart); 2591 } else { 2592 if (Reverse) { 2593 // If we store to reverse consecutive memory locations, then we need 2594 // to reverse the order of elements in the stored value. 2595 StoredVal = reverseVector(StoredVal); 2596 // We don't want to update the value in the map as it might be used in 2597 // another expression. So don't call resetVectorValue(StoredVal). 2598 } 2599 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2600 if (isMaskRequired) 2601 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2602 BlockInMaskParts[Part]); 2603 else 2604 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2605 } 2606 addMetadata(NewSI, SI); 2607 } 2608 return; 2609 } 2610 2611 // Handle loads. 2612 assert(LI && "Must have a load instruction"); 2613 setDebugLocFromInst(Builder, LI); 2614 for (unsigned Part = 0; Part < UF; ++Part) { 2615 Value *NewLI; 2616 if (CreateGatherScatter) { 2617 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2618 Value *VectorGep = State.get(Addr, Part); 2619 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2620 nullptr, "wide.masked.gather"); 2621 addMetadata(NewLI, LI); 2622 } else { 2623 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2624 if (isMaskRequired) 2625 NewLI = Builder.CreateMaskedLoad( 2626 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2627 "wide.masked.load"); 2628 else 2629 NewLI = 2630 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2631 2632 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2633 addMetadata(NewLI, LI); 2634 if (Reverse) 2635 NewLI = reverseVector(NewLI); 2636 } 2637 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2638 } 2639 } 2640 2641 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2642 const VPIteration &Instance, 2643 bool IfPredicateInstr, 2644 VPTransformState &State) { 2645 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2646 2647 setDebugLocFromInst(Builder, Instr); 2648 2649 // Does this instruction return a value ? 
2650 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2651 2652 Instruction *Cloned = Instr->clone(); 2653 if (!IsVoidRetTy) 2654 Cloned->setName(Instr->getName() + ".cloned"); 2655 2656 // Replace the operands of the cloned instructions with their scalar 2657 // equivalents in the new loop. 2658 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2659 auto *NewOp = State.get(User.getOperand(op), Instance); 2660 Cloned->setOperand(op, NewOp); 2661 } 2662 addNewMetadata(Cloned, Instr); 2663 2664 // Place the cloned scalar in the new loop. 2665 Builder.Insert(Cloned); 2666 2667 // Add the cloned scalar to the scalar map entry. 2668 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2669 2670 // If we just cloned a new assumption, add it the assumption cache. 2671 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2672 if (II->getIntrinsicID() == Intrinsic::assume) 2673 AC->registerAssumption(II); 2674 2675 // End if-block. 2676 if (IfPredicateInstr) 2677 PredicatedInstructions.push_back(Cloned); 2678 } 2679 2680 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2681 Value *End, Value *Step, 2682 Instruction *DL) { 2683 BasicBlock *Header = L->getHeader(); 2684 BasicBlock *Latch = L->getLoopLatch(); 2685 // As we're just creating this loop, it's possible no latch exists 2686 // yet. If so, use the header as this will be a single block loop. 2687 if (!Latch) 2688 Latch = Header; 2689 2690 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2691 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2692 setDebugLocFromInst(Builder, OldInst); 2693 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2694 2695 Builder.SetInsertPoint(Latch->getTerminator()); 2696 setDebugLocFromInst(Builder, OldInst); 2697 2698 // Create i+1 and fill the PHINode. 2699 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2700 Induction->addIncoming(Start, L->getLoopPreheader()); 2701 Induction->addIncoming(Next, Latch); 2702 // Create the compare. 2703 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2704 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2705 2706 // Now we have two terminators. Remove the old one from the block. 2707 Latch->getTerminator()->eraseFromParent(); 2708 2709 return Induction; 2710 } 2711 2712 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2713 if (TripCount) 2714 return TripCount; 2715 2716 assert(L && "Create Trip Count for null loop."); 2717 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2718 // Find the loop boundaries. 2719 ScalarEvolution *SE = PSE.getSE(); 2720 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2721 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2722 "Invalid loop count"); 2723 2724 Type *IdxTy = Legal->getWidestInductionType(); 2725 assert(IdxTy && "No type for induction"); 2726 2727 // The exit count might have the type of i64 while the phi is i32. This can 2728 // happen if we have an induction variable that is sign extended before the 2729 // compare. The only way that we get a backedge taken count is that the 2730 // induction variable was signed and as such will not overflow. In such a case 2731 // truncation is legal. 
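  // For example (sketch): an i32 induction compared against a value that was
  // sign-extended to i64 can yield an i64 backedge-taken count, which is
  // truncated to the i32 induction type below; a narrower count (say, i8)
  // would instead be zero-extended up to the widest induction type.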
2732 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2733 IdxTy->getPrimitiveSizeInBits()) 2734 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2735 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2736 2737 // Get the total trip count from the count by adding 1. 2738 const SCEV *ExitCount = SE->getAddExpr( 2739 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2740 2741 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2742 2743 // Expand the trip count and place the new instructions in the preheader. 2744 // Notice that the pre-header does not change, only the loop body. 2745 SCEVExpander Exp(*SE, DL, "induction"); 2746 2747 // Count holds the overall loop count (N). 2748 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2749 L->getLoopPreheader()->getTerminator()); 2750 2751 if (TripCount->getType()->isPointerTy()) 2752 TripCount = 2753 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2754 L->getLoopPreheader()->getTerminator()); 2755 2756 return TripCount; 2757 } 2758 2759 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2760 if (VectorTripCount) 2761 return VectorTripCount; 2762 2763 Value *TC = getOrCreateTripCount(L); 2764 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2765 2766 Type *Ty = TC->getType(); 2767 // This is where we can make the step a runtime constant. 2768 assert(!VF.isScalable() && "scalable vectorization is not supported yet"); 2769 Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF); 2770 2771 // If the tail is to be folded by masking, round the number of iterations N 2772 // up to a multiple of Step instead of rounding down. This is done by first 2773 // adding Step-1 and then rounding down. Note that it's ok if this addition 2774 // overflows: the vector induction variable will eventually wrap to zero given 2775 // that it starts at zero and its Step is a power of two; the loop will then 2776 // exit, with the last early-exit vector comparison also producing all-true. 2777 if (Cost->foldTailByMasking()) { 2778 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2779 "VF*UF must be a power of 2 when folding tail by masking"); 2780 TC = Builder.CreateAdd( 2781 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 2782 } 2783 2784 // Now we need to generate the expression for the part of the loop that the 2785 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2786 // iterations are not required for correctness, or N - Step, otherwise. Step 2787 // is equal to the vectorization factor (number of SIMD elements) times the 2788 // unroll factor (number of SIMD instructions). 2789 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2790 2791 // If there is a non-reversed interleaved group that may speculatively access 2792 // memory out-of-bounds, we need to ensure that there will be at least one 2793 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2794 // the trip count, we set the remainder to be equal to the step. If the step 2795 // does not evenly divide the trip count, no adjustment is necessary since 2796 // there will already be scalar iterations. Note that the minimum iterations 2797 // check ensures that N >= Step. 
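  // Worked example (illustrative only, assuming VF = 4 and UF = 2, so
  // Step = 8):
  //  * N = 20: R = 20 % 8 = 4, so n.vec = 16 and 4 scalar iterations remain.
  //  * N = 24 with a required scalar epilogue: R = 0 is bumped to 8 below, so
  //    n.vec = 16 and 8 scalar iterations remain.
  //  * Tail folded by masking: N = 20 was already rounded up above, giving
  //    n.vec = 24, and the masked vector loop covers every iteration.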
2798 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 2799 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2800 R = Builder.CreateSelect(IsZero, Step, R); 2801 } 2802 2803 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2804 2805 return VectorTripCount; 2806 } 2807 2808 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2809 const DataLayout &DL) { 2810 // Verify that V is a vector type with same number of elements as DstVTy. 2811 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2812 unsigned VF = DstFVTy->getNumElements(); 2813 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2814 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2815 Type *SrcElemTy = SrcVecTy->getElementType(); 2816 Type *DstElemTy = DstFVTy->getElementType(); 2817 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2818 "Vector elements must have same size"); 2819 2820 // Do a direct cast if element types are castable. 2821 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2822 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2823 } 2824 // V cannot be directly casted to desired vector type. 2825 // May happen when V is a floating point vector but DstVTy is a vector of 2826 // pointers or vice-versa. Handle this using a two-step bitcast using an 2827 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2828 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2829 "Only one type should be a pointer type"); 2830 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2831 "Only one type should be a floating point type"); 2832 Type *IntTy = 2833 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2834 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2835 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2836 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2837 } 2838 2839 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2840 BasicBlock *Bypass) { 2841 Value *Count = getOrCreateTripCount(L); 2842 // Reuse existing vector loop preheader for TC checks. 2843 // Note that new preheader block is generated for vector loop. 2844 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2845 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2846 2847 // Generate code to check if the loop's trip count is less than VF * UF, or 2848 // equal to it in case a scalar epilogue is required; this implies that the 2849 // vector trip count is zero. This check also covers the case where adding one 2850 // to the backedge-taken count overflowed leading to an incorrect trip count 2851 // of zero. In this case we will also jump to the scalar loop. 2852 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2853 : ICmpInst::ICMP_ULT; 2854 2855 // If tail is to be folded, vector loop takes care of all iterations. 2856 Value *CheckMinIters = Builder.getFalse(); 2857 if (!Cost->foldTailByMasking()) { 2858 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2859 CheckMinIters = Builder.CreateICmp( 2860 P, Count, 2861 ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF), 2862 "min.iters.check"); 2863 } 2864 // Create new preheader for vector loop. 
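  // After the split below, the layout is roughly (sketch, assuming VF = 4 and
  // UF = 2):
  //   TCCheckBlock (the old vector.ph):
  //     %min.iters.check = icmp ult i64 %count, 8   ; ule if a scalar
  //                                                 ; epilogue is required
  //     br i1 %min.iters.check, label %scalar.ph, label %vector.ph
  //   vector.ph (newly split off): the real preheader of the vector loop.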
2865 LoopVectorPreHeader = 2866 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2867 "vector.ph"); 2868 2869 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2870 DT->getNode(Bypass)->getIDom()) && 2871 "TC check is expected to dominate Bypass"); 2872 2873 // Update dominator for Bypass & LoopExit. 2874 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2875 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2876 2877 ReplaceInstWithInst( 2878 TCCheckBlock->getTerminator(), 2879 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2880 LoopBypassBlocks.push_back(TCCheckBlock); 2881 } 2882 2883 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2884 // Reuse existing vector loop preheader for SCEV checks. 2885 // Note that new preheader block is generated for vector loop. 2886 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2887 2888 // Generate the code to check that the SCEV assumptions that we made. 2889 // We want the new basic block to start at the first instruction in a 2890 // sequence of instructions that form a check. 2891 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2892 "scev.check"); 2893 Value *SCEVCheck = Exp.expandCodeForPredicate( 2894 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2895 2896 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2897 if (C->isZero()) 2898 return; 2899 2900 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2901 (OptForSizeBasedOnProfile && 2902 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2903 "Cannot SCEV check stride or overflow when optimizing for size"); 2904 2905 SCEVCheckBlock->setName("vector.scevcheck"); 2906 // Create new preheader for vector loop. 2907 LoopVectorPreHeader = 2908 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2909 nullptr, "vector.ph"); 2910 2911 // Update dominator only if this is first RT check. 2912 if (LoopBypassBlocks.empty()) { 2913 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2914 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2915 } 2916 2917 ReplaceInstWithInst( 2918 SCEVCheckBlock->getTerminator(), 2919 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2920 LoopBypassBlocks.push_back(SCEVCheckBlock); 2921 AddedSafetyChecks = true; 2922 } 2923 2924 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2925 // VPlan-native path does not do any analysis for runtime checks currently. 2926 if (EnableVPlanNativePath) 2927 return; 2928 2929 // Reuse existing vector loop preheader for runtime memory checks. 2930 // Note that new preheader block is generated for vector loop. 2931 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2932 2933 // Generate the code that checks in runtime if arrays overlap. We put the 2934 // checks into a separate block to make the more common case of few elements 2935 // faster. 
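  // Each pointer-group pair is conceptually tested for overlap; the code
  // emitted by addRuntimeChecks() looks roughly like (sketch):
  //   %bound0         = icmp ult i8* %A.start, %B.end
  //   %bound1         = icmp ult i8* %B.start, %A.end
  //   %found.conflict = and i1 %bound0, %bound1
  // and the vector loop is only entered at run time if no pair conflicts.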
2936 auto *LAI = Legal->getLAI(); 2937 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 2938 if (!RtPtrChecking.Need) 2939 return; 2940 2941 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2942 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2943 "Cannot emit memory checks when optimizing for size, unless forced " 2944 "to vectorize."); 2945 ORE->emit([&]() { 2946 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2947 L->getStartLoc(), L->getHeader()) 2948 << "Code-size may be reduced by not forcing " 2949 "vectorization, or by source-code modifications " 2950 "eliminating the need for runtime checks " 2951 "(e.g., adding 'restrict')."; 2952 }); 2953 } 2954 2955 MemCheckBlock->setName("vector.memcheck"); 2956 // Create new preheader for vector loop. 2957 LoopVectorPreHeader = 2958 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2959 "vector.ph"); 2960 2961 auto *CondBranch = cast<BranchInst>( 2962 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 2963 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 2964 LoopBypassBlocks.push_back(MemCheckBlock); 2965 AddedSafetyChecks = true; 2966 2967 // Update dominator only if this is first RT check. 2968 if (LoopBypassBlocks.empty()) { 2969 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2970 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2971 } 2972 2973 Instruction *FirstCheckInst; 2974 Instruction *MemRuntimeCheck; 2975 std::tie(FirstCheckInst, MemRuntimeCheck) = 2976 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 2977 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 2978 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 2979 "claimed checks are required"); 2980 CondBranch->setCondition(MemRuntimeCheck); 2981 2982 // We currently don't use LoopVersioning for the actual loop cloning but we 2983 // still use it to add the noalias metadata. 2984 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2985 PSE.getSE()); 2986 LVer->prepareNoAliasMetadata(); 2987 } 2988 2989 Value *InnerLoopVectorizer::emitTransformedIndex( 2990 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2991 const InductionDescriptor &ID) const { 2992 2993 SCEVExpander Exp(*SE, DL, "induction"); 2994 auto Step = ID.getStep(); 2995 auto StartValue = ID.getStartValue(); 2996 assert(Index->getType() == Step->getType() && 2997 "Index type does not match StepValue type"); 2998 2999 // Note: the IR at this point is broken. We cannot use SE to create any new 3000 // SCEV and then expand it, hoping that SCEV's simplification will give us 3001 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3002 // lead to various SCEV crashes. So all we can do is to use builder and rely 3003 // on InstCombine for future simplifications. Here we handle some trivial 3004 // cases only. 
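  // For example (sketch): the helpers below fold "0 + X" and "X + 0" to X,
  // and "1 * X" and "X * 1" to X, so a unit-step induction expands to plain
  // "StartValue + Index" without relying on SCEV or InstCombine.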
3005 auto CreateAdd = [&B](Value *X, Value *Y) { 3006 assert(X->getType() == Y->getType() && "Types don't match!"); 3007 if (auto *CX = dyn_cast<ConstantInt>(X)) 3008 if (CX->isZero()) 3009 return Y; 3010 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3011 if (CY->isZero()) 3012 return X; 3013 return B.CreateAdd(X, Y); 3014 }; 3015 3016 auto CreateMul = [&B](Value *X, Value *Y) { 3017 assert(X->getType() == Y->getType() && "Types don't match!"); 3018 if (auto *CX = dyn_cast<ConstantInt>(X)) 3019 if (CX->isOne()) 3020 return Y; 3021 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3022 if (CY->isOne()) 3023 return X; 3024 return B.CreateMul(X, Y); 3025 }; 3026 3027 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3028 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3029 // the DomTree is not kept up-to-date for additional blocks generated in the 3030 // vector loop. By using the header as insertion point, we guarantee that the 3031 // expanded instructions dominate all their uses. 3032 auto GetInsertPoint = [this, &B]() { 3033 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3034 if (InsertBB != LoopVectorBody && 3035 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3036 return LoopVectorBody->getTerminator(); 3037 return &*B.GetInsertPoint(); 3038 }; 3039 switch (ID.getKind()) { 3040 case InductionDescriptor::IK_IntInduction: { 3041 assert(Index->getType() == StartValue->getType() && 3042 "Index type does not match StartValue type"); 3043 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3044 return B.CreateSub(StartValue, Index); 3045 auto *Offset = CreateMul( 3046 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3047 return CreateAdd(StartValue, Offset); 3048 } 3049 case InductionDescriptor::IK_PtrInduction: { 3050 assert(isa<SCEVConstant>(Step) && 3051 "Expected constant step for pointer induction"); 3052 return B.CreateGEP( 3053 StartValue->getType()->getPointerElementType(), StartValue, 3054 CreateMul(Index, 3055 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3056 } 3057 case InductionDescriptor::IK_FpInduction: { 3058 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3059 auto InductionBinOp = ID.getInductionBinOp(); 3060 assert(InductionBinOp && 3061 (InductionBinOp->getOpcode() == Instruction::FAdd || 3062 InductionBinOp->getOpcode() == Instruction::FSub) && 3063 "Original bin op should be defined for FP induction"); 3064 3065 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3066 3067 // Floating point operations had to be 'fast' to enable the induction. 3068 FastMathFlags Flags; 3069 Flags.setFast(); 3070 3071 Value *MulExp = B.CreateFMul(StepValue, Index); 3072 if (isa<Instruction>(MulExp)) 3073 // We have to check, the MulExp may be a constant. 
3074 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3075 3076 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3077 "induction"); 3078 if (isa<Instruction>(BOp)) 3079 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3080 3081 return BOp; 3082 } 3083 case InductionDescriptor::IK_NoInduction: 3084 return nullptr; 3085 } 3086 llvm_unreachable("invalid enum"); 3087 } 3088 3089 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3090 LoopScalarBody = OrigLoop->getHeader(); 3091 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3092 LoopExitBlock = OrigLoop->getExitBlock(); 3093 assert(LoopExitBlock && "Must have an exit block"); 3094 assert(LoopVectorPreHeader && "Invalid loop structure"); 3095 3096 LoopMiddleBlock = 3097 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3098 LI, nullptr, Twine(Prefix) + "middle.block"); 3099 LoopScalarPreHeader = 3100 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3101 nullptr, Twine(Prefix) + "scalar.ph"); 3102 // We intentionally don't let SplitBlock to update LoopInfo since 3103 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3104 // LoopVectorBody is explicitly added to the correct place few lines later. 3105 LoopVectorBody = 3106 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3107 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3108 3109 // Update dominator for loop exit. 3110 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3111 3112 // Create and register the new vector loop. 3113 Loop *Lp = LI->AllocateLoop(); 3114 Loop *ParentLoop = OrigLoop->getParentLoop(); 3115 3116 // Insert the new loop into the loop nest and register the new basic blocks 3117 // before calling any utilities such as SCEV that require valid LoopInfo. 3118 if (ParentLoop) { 3119 ParentLoop->addChildLoop(Lp); 3120 } else { 3121 LI->addTopLevelLoop(Lp); 3122 } 3123 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3124 return Lp; 3125 } 3126 3127 void InnerLoopVectorizer::createInductionResumeValues(Loop *L, 3128 Value *VectorTripCount) { 3129 assert(VectorTripCount && L && "Expected valid arguments"); 3130 // We are going to resume the execution of the scalar loop. 3131 // Go over all of the induction variables that we found and fix the 3132 // PHIs that are left in the scalar version of the loop. 3133 // The starting values of PHI nodes depend on the counter of the last 3134 // iteration in the vectorized loop. 3135 // If we come from a bypass edge then we need to start from the original 3136 // start value. 3137 for (auto &InductionEntry : Legal->getInductionVars()) { 3138 PHINode *OrigPhi = InductionEntry.first; 3139 InductionDescriptor II = InductionEntry.second; 3140 3141 // Create phi nodes to merge from the backedge-taken check block. 3142 PHINode *BCResumeVal = 3143 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3144 LoopScalarPreHeader->getTerminator()); 3145 // Copy original phi DL over to the new one. 3146 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3147 Value *&EndValue = IVEndValues[OrigPhi]; 3148 if (OrigPhi == OldInduction) { 3149 // We know what the end value is. 
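      // (The primary induction counts from zero in steps of one, so its value
      // after the vector loop is exactly the vector trip count.)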
3150 EndValue = VectorTripCount; 3151 } else { 3152 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3153 Type *StepType = II.getStep()->getType(); 3154 Instruction::CastOps CastOp = 3155 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3156 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3157 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3158 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3159 EndValue->setName("ind.end"); 3160 } 3161 3162 // The new PHI merges the original incoming value, in case of a bypass, 3163 // or the value at the end of the vectorized loop. 3164 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3165 3166 // Fix the scalar body counter (PHI node). 3167 // The old induction's phi node in the scalar body needs the truncated 3168 // value. 3169 for (BasicBlock *BB : LoopBypassBlocks) 3170 BCResumeVal->addIncoming(II.getStartValue(), BB); 3171 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3172 } 3173 } 3174 3175 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3176 MDNode *OrigLoopID) { 3177 assert(L && "Expected valid loop."); 3178 3179 // The trip counts should be cached by now. 3180 Value *Count = getOrCreateTripCount(L); 3181 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3182 3183 // We need the OrigLoop (scalar loop part) latch terminator to help 3184 // produce correct debug info for the middle block BB instructions. 3185 // The legality check stage guarantees that the loop will have a single 3186 // latch. 3187 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3188 "Scalar loop latch terminator isn't a branch"); 3189 BranchInst *ScalarLatchBr = 3190 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3191 3192 // Add a check in the middle block to see if we have completed 3193 // all of the iterations in the first vector loop. 3194 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3195 // If tail is to be folded, we know we don't need to run the remainder. 3196 Value *CmpN = Builder.getTrue(); 3197 if (!Cost->foldTailByMasking()) { 3198 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3199 VectorTripCount, "cmp.n", 3200 LoopMiddleBlock->getTerminator()); 3201 3202 // Here we use the same DebugLoc as the scalar loop latch branch instead 3203 // of the corresponding compare because they may have ended up with 3204 // different line numbers and we want to avoid awkward line stepping while 3205 // debugging. Eg. if the compare has got a line number inside the loop. 3206 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3207 } 3208 3209 BranchInst *BrInst = 3210 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3211 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3212 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3213 3214 // Get ready to start creating new instructions into the vectorized body. 3215 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3216 "Inconsistent vector loop preheader"); 3217 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3218 3219 Optional<MDNode *> VectorizedLoopID = 3220 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3221 LLVMLoopVectorizeFollowupVectorized}); 3222 if (VectorizedLoopID.hasValue()) { 3223 L->setLoopID(VectorizedLoopID.getValue()); 3224 3225 // Do not setAlreadyVectorized if loop attributes have been defined 3226 // explicitly. 
3227 return LoopVectorPreHeader; 3228 } 3229 3230 // Keep all loop hints from the original loop on the vector loop (we'll 3231 // replace the vectorizer-specific hints below). 3232 if (MDNode *LID = OrigLoop->getLoopID()) 3233 L->setLoopID(LID); 3234 3235 LoopVectorizeHints Hints(L, true, *ORE); 3236 Hints.setAlreadyVectorized(); 3237 3238 #ifdef EXPENSIVE_CHECKS 3239 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3240 LI->verify(*DT); 3241 #endif 3242 3243 return LoopVectorPreHeader; 3244 } 3245 3246 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3247 /* 3248 In this function we generate a new loop. The new loop will contain 3249 the vectorized instructions while the old loop will continue to run the 3250 scalar remainder. 3251 3252 [ ] <-- loop iteration number check. 3253 / | 3254 / v 3255 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3256 | / | 3257 | / v 3258 || [ ] <-- vector pre header. 3259 |/ | 3260 | v 3261 | [ ] \ 3262 | [ ]_| <-- vector loop. 3263 | | 3264 | v 3265 | -[ ] <--- middle-block. 3266 | / | 3267 | / v 3268 -|- >[ ] <--- new preheader. 3269 | | 3270 | v 3271 | [ ] \ 3272 | [ ]_| <-- old scalar loop to handle remainder. 3273 \ | 3274 \ v 3275 >[ ] <-- exit block. 3276 ... 3277 */ 3278 3279 // Get the metadata of the original loop before it gets modified. 3280 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3281 3282 // Create an empty vector loop, and prepare basic blocks for the runtime 3283 // checks. 3284 Loop *Lp = createVectorLoopSkeleton(""); 3285 3286 // Now, compare the new count to zero. If it is zero skip the vector loop and 3287 // jump to the scalar loop. This check also covers the case where the 3288 // backedge-taken count is uint##_max: adding one to it will overflow leading 3289 // to an incorrect trip count of zero. In this (rare) case we will also jump 3290 // to the scalar loop. 3291 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3292 3293 // Generate the code to check any assumptions that we've made for SCEV 3294 // expressions. 3295 emitSCEVChecks(Lp, LoopScalarPreHeader); 3296 3297 // Generate the code that checks in runtime if arrays overlap. We put the 3298 // checks into a separate block to make the more common case of few elements 3299 // faster. 3300 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3301 3302 // Some loops have a single integer induction variable, while other loops 3303 // don't. One example is c++ iterators that often have multiple pointer 3304 // induction variables. In the code below we also support a case where we 3305 // don't have a single induction variable. 3306 // 3307 // We try to obtain an induction variable from the original loop as hard 3308 // as possible. However if we don't find one that: 3309 // - is an integer 3310 // - counts from zero, stepping by one 3311 // - is the size of the widest induction variable type 3312 // then we create a new one. 3313 OldInduction = Legal->getPrimaryInduction(); 3314 Type *IdxTy = Legal->getWidestInductionType(); 3315 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3316 // The loop step is equal to the vectorization factor (num of SIMD elements) 3317 // times the unroll factor (num of SIMD instructions). 
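  // For instance (sketch, assuming VF = 4 and UF = 2), the canonical IV built
  // below by createInductionVariable() behaves like:
  //   %index      = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  //   %index.next = add i64 %index, 8
  //   %cmp        = icmp eq i64 %index.next, %n.vec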
3318 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3319 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 3320 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3321 Induction = 3322 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3323 getDebugLocFromInstOrOperands(OldInduction)); 3324 3325 // Emit phis for the new starting index of the scalar loop. 3326 createInductionResumeValues(Lp, CountRoundDown); 3327 3328 return completeLoopSkeleton(Lp, OrigLoopID); 3329 } 3330 3331 // Fix up external users of the induction variable. At this point, we are 3332 // in LCSSA form, with all external PHIs that use the IV having one input value, 3333 // coming from the remainder loop. We need those PHIs to also have a correct 3334 // value for the IV when arriving directly from the middle block. 3335 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3336 const InductionDescriptor &II, 3337 Value *CountRoundDown, Value *EndValue, 3338 BasicBlock *MiddleBlock) { 3339 // There are two kinds of external IV usages - those that use the value 3340 // computed in the last iteration (the PHI) and those that use the penultimate 3341 // value (the value that feeds into the phi from the loop latch). 3342 // We allow both, but they, obviously, have different values. 3343 3344 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3345 3346 DenseMap<Value *, Value *> MissingVals; 3347 3348 // An external user of the last iteration's value should see the value that 3349 // the remainder loop uses to initialize its own IV. 3350 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3351 for (User *U : PostInc->users()) { 3352 Instruction *UI = cast<Instruction>(U); 3353 if (!OrigLoop->contains(UI)) { 3354 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3355 MissingVals[UI] = EndValue; 3356 } 3357 } 3358 3359 // An external user of the penultimate value need to see EndValue - Step. 3360 // The simplest way to get this is to recompute it from the constituent SCEVs, 3361 // that is Start + (Step * (CRD - 1)). 3362 for (User *U : OrigPhi->users()) { 3363 auto *UI = cast<Instruction>(U); 3364 if (!OrigLoop->contains(UI)) { 3365 const DataLayout &DL = 3366 OrigLoop->getHeader()->getModule()->getDataLayout(); 3367 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3368 3369 IRBuilder<> B(MiddleBlock->getTerminator()); 3370 Value *CountMinusOne = B.CreateSub( 3371 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3372 Value *CMO = 3373 !II.getStep()->getType()->isIntegerTy() 3374 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3375 II.getStep()->getType()) 3376 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3377 CMO->setName("cast.cmo"); 3378 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3379 Escape->setName("ind.escape"); 3380 MissingVals[UI] = Escape; 3381 } 3382 } 3383 3384 for (auto &I : MissingVals) { 3385 PHINode *PHI = cast<PHINode>(I.first); 3386 // One corner case we have to handle is two IVs "chasing" each-other, 3387 // that is %IV2 = phi [...], [ %IV1, %latch ] 3388 // In this case, if IV1 has an external use, we need to avoid adding both 3389 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3390 // don't already have an incoming value for the middle block. 
3391 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3392 PHI->addIncoming(I.second, MiddleBlock); 3393 } 3394 } 3395 3396 namespace { 3397 3398 struct CSEDenseMapInfo { 3399 static bool canHandle(const Instruction *I) { 3400 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3401 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3402 } 3403 3404 static inline Instruction *getEmptyKey() { 3405 return DenseMapInfo<Instruction *>::getEmptyKey(); 3406 } 3407 3408 static inline Instruction *getTombstoneKey() { 3409 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3410 } 3411 3412 static unsigned getHashValue(const Instruction *I) { 3413 assert(canHandle(I) && "Unknown instruction!"); 3414 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3415 I->value_op_end())); 3416 } 3417 3418 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3419 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3420 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3421 return LHS == RHS; 3422 return LHS->isIdenticalTo(RHS); 3423 } 3424 }; 3425 3426 } // end anonymous namespace 3427 3428 ///Perform cse of induction variable instructions. 3429 static void cse(BasicBlock *BB) { 3430 // Perform simple cse. 3431 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3432 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3433 Instruction *In = &*I++; 3434 3435 if (!CSEDenseMapInfo::canHandle(In)) 3436 continue; 3437 3438 // Check if we can replace this instruction with any of the 3439 // visited instructions. 3440 if (Instruction *V = CSEMap.lookup(In)) { 3441 In->replaceAllUsesWith(V); 3442 In->eraseFromParent(); 3443 continue; 3444 } 3445 3446 CSEMap[In] = In; 3447 } 3448 } 3449 3450 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3451 ElementCount VF, 3452 bool &NeedToScalarize) { 3453 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3454 Function *F = CI->getCalledFunction(); 3455 Type *ScalarRetTy = CI->getType(); 3456 SmallVector<Type *, 4> Tys, ScalarTys; 3457 for (auto &ArgOp : CI->arg_operands()) 3458 ScalarTys.push_back(ArgOp->getType()); 3459 3460 // Estimate cost of scalarized vector call. The source operands are assumed 3461 // to be vectors, so we need to extract individual elements from there, 3462 // execute VF scalar calls, and then gather the result into the vector return 3463 // value. 3464 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3465 TTI::TCK_RecipThroughput); 3466 if (VF.isScalar()) 3467 return ScalarCallCost; 3468 3469 // Compute corresponding vector type for return value and arguments. 3470 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3471 for (Type *ScalarTy : ScalarTys) 3472 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3473 3474 // Compute costs of unpacking argument values for the scalar calls and 3475 // packing the return values to a vector. 3476 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3477 3478 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3479 3480 // If we can't emit a vector call for this function, then the currently found 3481 // cost is the cost we need to return. 
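  // Rough model (illustrative numbers only): with VF = 4 and a scalar call
  // cost of 10, the scalarized estimate is 4 * 10 plus the extract/insert
  // overhead from getScalarizationOverhead(); if a vector variant is
  // available (via VFDatabase/TLI) and its call cost is lower, that cost wins
  // below and NeedToScalarize is cleared.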
3482 NeedToScalarize = true; 3483 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3484 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3485 3486 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3487 return Cost; 3488 3489 // If the corresponding vector cost is cheaper, return its cost. 3490 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3491 TTI::TCK_RecipThroughput); 3492 if (VectorCallCost < Cost) { 3493 NeedToScalarize = false; 3494 return VectorCallCost; 3495 } 3496 return Cost; 3497 } 3498 3499 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3500 ElementCount VF) { 3501 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3502 assert(ID && "Expected intrinsic call!"); 3503 3504 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3505 return TTI.getIntrinsicInstrCost(CostAttrs, 3506 TargetTransformInfo::TCK_RecipThroughput); 3507 } 3508 3509 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3510 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3511 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3512 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3513 } 3514 3515 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3516 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3517 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3518 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3519 } 3520 3521 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3522 // For every instruction `I` in MinBWs, truncate the operands, create a 3523 // truncated version of `I` and reextend its result. InstCombine runs 3524 // later and will remove any ext/trunc pairs. 3525 SmallPtrSet<Value *, 4> Erased; 3526 for (const auto &KV : Cost->getMinimalBitwidths()) { 3527 // If the value wasn't vectorized, we must maintain the original scalar 3528 // type. The absence of the value from VectorLoopValueMap indicates that it 3529 // wasn't vectorized. 3530 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3531 continue; 3532 for (unsigned Part = 0; Part < UF; ++Part) { 3533 Value *I = getOrCreateVectorValue(KV.first, Part); 3534 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3535 continue; 3536 Type *OriginalTy = I->getType(); 3537 Type *ScalarTruncatedTy = 3538 IntegerType::get(OriginalTy->getContext(), KV.second); 3539 auto *TruncatedTy = FixedVectorType::get( 3540 ScalarTruncatedTy, 3541 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3542 if (TruncatedTy == OriginalTy) 3543 continue; 3544 3545 IRBuilder<> B(cast<Instruction>(I)); 3546 auto ShrinkOperand = [&](Value *V) -> Value * { 3547 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3548 if (ZI->getSrcTy() == TruncatedTy) 3549 return ZI->getOperand(0); 3550 return B.CreateZExtOrTrunc(V, TruncatedTy); 3551 }; 3552 3553 // The actual instruction modification depends on the instruction type, 3554 // unfortunately. 3555 Value *NewI = nullptr; 3556 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3557 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3558 ShrinkOperand(BO->getOperand(1))); 3559 3560 // Any wrapping introduced by shrinking this operation shouldn't be 3561 // considered undefined behavior. So, we can't unconditionally copy 3562 // arithmetic wrapping flags to NewI. 
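        // E.g. an i32 "add nuw" narrowed to i8 may now wrap modulo 256, so
        // nuw/nsw are dropped while the remaining flags are kept.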
3563 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3564 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3565 NewI = 3566 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3567 ShrinkOperand(CI->getOperand(1))); 3568 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3569 NewI = B.CreateSelect(SI->getCondition(), 3570 ShrinkOperand(SI->getTrueValue()), 3571 ShrinkOperand(SI->getFalseValue())); 3572 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3573 switch (CI->getOpcode()) { 3574 default: 3575 llvm_unreachable("Unhandled cast!"); 3576 case Instruction::Trunc: 3577 NewI = ShrinkOperand(CI->getOperand(0)); 3578 break; 3579 case Instruction::SExt: 3580 NewI = B.CreateSExtOrTrunc( 3581 CI->getOperand(0), 3582 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3583 break; 3584 case Instruction::ZExt: 3585 NewI = B.CreateZExtOrTrunc( 3586 CI->getOperand(0), 3587 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3588 break; 3589 } 3590 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3591 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3592 ->getNumElements(); 3593 auto *O0 = B.CreateZExtOrTrunc( 3594 SI->getOperand(0), 3595 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3596 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3597 ->getNumElements(); 3598 auto *O1 = B.CreateZExtOrTrunc( 3599 SI->getOperand(1), 3600 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3601 3602 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3603 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3604 // Don't do anything with the operands, just extend the result. 3605 continue; 3606 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3607 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3608 ->getNumElements(); 3609 auto *O0 = B.CreateZExtOrTrunc( 3610 IE->getOperand(0), 3611 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3612 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3613 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3614 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3615 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3616 ->getNumElements(); 3617 auto *O0 = B.CreateZExtOrTrunc( 3618 EE->getOperand(0), 3619 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3620 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3621 } else { 3622 // If we don't know what to do, be conservative and don't do anything. 3623 continue; 3624 } 3625 3626 // Lastly, extend the result. 3627 NewI->takeName(cast<Instruction>(I)); 3628 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3629 I->replaceAllUsesWith(Res); 3630 cast<Instruction>(I)->eraseFromParent(); 3631 Erased.insert(I); 3632 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3633 } 3634 } 3635 3636 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3637 for (const auto &KV : Cost->getMinimalBitwidths()) { 3638 // If the value wasn't vectorized, we must maintain the original scalar 3639 // type. The absence of the value from VectorLoopValueMap indicates that it 3640 // wasn't vectorized. 
3641 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3642 continue; 3643 for (unsigned Part = 0; Part < UF; ++Part) { 3644 Value *I = getOrCreateVectorValue(KV.first, Part); 3645 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3646 if (Inst && Inst->use_empty()) { 3647 Value *NewI = Inst->getOperand(0); 3648 Inst->eraseFromParent(); 3649 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3650 } 3651 } 3652 } 3653 } 3654 3655 void InnerLoopVectorizer::fixVectorizedLoop() { 3656 // Insert truncates and extends for any truncated instructions as hints to 3657 // InstCombine. 3658 if (VF.isVector()) 3659 truncateToMinimalBitwidths(); 3660 3661 // Fix widened non-induction PHIs by setting up the PHI operands. 3662 if (OrigPHIsToFix.size()) { 3663 assert(EnableVPlanNativePath && 3664 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3665 fixNonInductionPHIs(); 3666 } 3667 3668 // At this point every instruction in the original loop is widened to a 3669 // vector form. Now we need to fix the recurrences in the loop. These PHI 3670 // nodes are currently empty because we did not want to introduce cycles. 3671 // This is the second stage of vectorizing recurrences. 3672 fixCrossIterationPHIs(); 3673 3674 // Forget the original basic block. 3675 PSE.getSE()->forgetLoop(OrigLoop); 3676 3677 // Fix-up external users of the induction variables. 3678 for (auto &Entry : Legal->getInductionVars()) 3679 fixupIVUsers(Entry.first, Entry.second, 3680 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3681 IVEndValues[Entry.first], LoopMiddleBlock); 3682 3683 fixLCSSAPHIs(); 3684 for (Instruction *PI : PredicatedInstructions) 3685 sinkScalarOperands(&*PI); 3686 3687 // Remove redundant induction instructions. 3688 cse(LoopVectorBody); 3689 3690 // Set/update profile weights for the vector and remainder loops as original 3691 // loop iterations are now distributed among them. Note that original loop 3692 // represented by LoopScalarBody becomes remainder loop after vectorization. 3693 // 3694 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3695 // end up getting slightly roughened result but that should be OK since 3696 // profile is not inherently precise anyway. Note also possible bypass of 3697 // vector code caused by legality checks is ignored, assigning all the weight 3698 // to the vector loop, optimistically. 3699 assert(!VF.isScalable() && 3700 "cannot use scalable ElementCount to determine unroll factor"); 3701 setProfileInfoAfterUnrolling( 3702 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 3703 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 3704 } 3705 3706 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3707 // In order to support recurrences we need to be able to vectorize Phi nodes. 3708 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3709 // stage #2: We now need to fix the recurrences by adding incoming edges to 3710 // the currently empty PHI nodes. At this point every instruction in the 3711 // original loop is widened to a vector form so we can use them to construct 3712 // the incoming edges. 3713 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3714 // Handle first-order recurrences and reductions that need to be fixed. 
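    // For example (sketch): in
    //   for (i = 0; i < n; ++i) { sum += a[i]; b[i] = a[i] - prev; prev = a[i]; }
    // "sum" is a reduction and "prev" is a first-order recurrence; both have
    // header phis whose incoming values are completed here.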
3715 if (Legal->isFirstOrderRecurrence(&Phi)) 3716 fixFirstOrderRecurrence(&Phi); 3717 else if (Legal->isReductionVariable(&Phi)) 3718 fixReduction(&Phi); 3719 } 3720 } 3721 3722 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3723 // This is the second phase of vectorizing first-order recurrences. An 3724 // overview of the transformation is described below. Suppose we have the 3725 // following loop. 3726 // 3727 // for (int i = 0; i < n; ++i) 3728 // b[i] = a[i] - a[i - 1]; 3729 // 3730 // There is a first-order recurrence on "a". For this loop, the shorthand 3731 // scalar IR looks like: 3732 // 3733 // scalar.ph: 3734 // s_init = a[-1] 3735 // br scalar.body 3736 // 3737 // scalar.body: 3738 // i = phi [0, scalar.ph], [i+1, scalar.body] 3739 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3740 // s2 = a[i] 3741 // b[i] = s2 - s1 3742 // br cond, scalar.body, ... 3743 // 3744 // In this example, s1 is a recurrence because it's value depends on the 3745 // previous iteration. In the first phase of vectorization, we created a 3746 // temporary value for s1. We now complete the vectorization and produce the 3747 // shorthand vector IR shown below (for VF = 4, UF = 1). 3748 // 3749 // vector.ph: 3750 // v_init = vector(..., ..., ..., a[-1]) 3751 // br vector.body 3752 // 3753 // vector.body 3754 // i = phi [0, vector.ph], [i+4, vector.body] 3755 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3756 // v2 = a[i, i+1, i+2, i+3]; 3757 // v3 = vector(v1(3), v2(0, 1, 2)) 3758 // b[i, i+1, i+2, i+3] = v2 - v3 3759 // br cond, vector.body, middle.block 3760 // 3761 // middle.block: 3762 // x = v2(3) 3763 // br scalar.ph 3764 // 3765 // scalar.ph: 3766 // s_init = phi [x, middle.block], [a[-1], otherwise] 3767 // br scalar.body 3768 // 3769 // After execution completes the vector loop, we extract the next value of 3770 // the recurrence (x) to use as the initial value in the scalar loop. 3771 3772 // Get the original loop preheader and single loop latch. 3773 auto *Preheader = OrigLoop->getLoopPreheader(); 3774 auto *Latch = OrigLoop->getLoopLatch(); 3775 3776 // Get the initial and previous values of the scalar recurrence. 3777 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3778 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3779 3780 // Create a vector from the initial value. 3781 auto *VectorInit = ScalarInit; 3782 if (VF.isVector()) { 3783 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3784 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 3785 VectorInit = Builder.CreateInsertElement( 3786 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3787 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 3788 } 3789 3790 // We constructed a temporary phi node in the first phase of vectorization. 3791 // This phi node will eventually be deleted. 3792 Builder.SetInsertPoint( 3793 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3794 3795 // Create a phi node for the new recurrence. The current value will either be 3796 // the initial value inserted into a vector or loop-varying vector value. 3797 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3798 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3799 3800 // Get the vectorized previous value of the last part UF - 1. It appears last 3801 // among all unrolled iterations, due to the order of their construction. 
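  // E.g. with UF = 2, part 1 of "Previous" is the value that will feed the
  // new recurrence phi across the vector-loop backedge.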
3802 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3803 3804 // Find and set the insertion point after the previous value if it is an 3805 // instruction. 3806 BasicBlock::iterator InsertPt; 3807 // Note that the previous value may have been constant-folded so it is not 3808 // guaranteed to be an instruction in the vector loop. 3809 // FIXME: Loop invariant values do not form recurrences. We should deal with 3810 // them earlier. 3811 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3812 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3813 else { 3814 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3815 if (isa<PHINode>(PreviousLastPart)) 3816 // If the previous value is a phi node, we should insert after all the phi 3817 // nodes in the block containing the PHI to avoid breaking basic block 3818 // verification. Note that the basic block may be different to 3819 // LoopVectorBody, in case we predicate the loop. 3820 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3821 else 3822 InsertPt = ++PreviousInst->getIterator(); 3823 } 3824 Builder.SetInsertPoint(&*InsertPt); 3825 3826 // We will construct a vector for the recurrence by combining the values for 3827 // the current and previous iterations. This is the required shuffle mask. 3828 assert(!VF.isScalable()); 3829 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 3830 ShuffleMask[0] = VF.getKnownMinValue() - 1; 3831 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 3832 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 3833 3834 // The vector from which to take the initial value for the current iteration 3835 // (actual or unrolled). Initially, this is the vector phi node. 3836 Value *Incoming = VecPhi; 3837 3838 // Shuffle the current and previous vector and update the vector parts. 3839 for (unsigned Part = 0; Part < UF; ++Part) { 3840 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3841 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3842 auto *Shuffle = 3843 VF.isVector() 3844 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 3845 : Incoming; 3846 PhiPart->replaceAllUsesWith(Shuffle); 3847 cast<Instruction>(PhiPart)->eraseFromParent(); 3848 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3849 Incoming = PreviousPart; 3850 } 3851 3852 // Fix the latch value of the new recurrence in the vector loop. 3853 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3854 3855 // Extract the last vector element in the middle block. This will be the 3856 // initial value for the recurrence when jumping to the scalar loop. 3857 auto *ExtractForScalar = Incoming; 3858 if (VF.isVector()) { 3859 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3860 ExtractForScalar = Builder.CreateExtractElement( 3861 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 3862 "vector.recur.extract"); 3863 } 3864 // Extract the second last element in the middle block if the 3865 // Phi is used outside the loop. We need to extract the phi itself 3866 // and not the last element (the phi update in the current iteration). This 3867 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3868 // when the scalar loop is not run at all. 
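  // Sketch for VF = 4: lane 3 of the last "Incoming" vector becomes the
  // scalar loop's incoming recurrence value, while lane 2 is the value the
  // original phi held in the last iteration handled by the vector loop, which
  // is what a user of the phi outside the loop needs.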
3869   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3870   if (VF.isVector())
3871     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3872         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
3873         "vector.recur.extract.for.phi");
3874   // When the loop is unrolled without vectorizing, initialize
3875   // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3876   // `Incoming`. This is analogous to the vectorized case above: extracting
3877   // the second last element when VF > 1.
3878   else if (UF > 1)
3879     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3880 
3881   // Fix the initial value of the original recurrence in the scalar loop.
3882   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3883   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3884   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3885     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3886     Start->addIncoming(Incoming, BB);
3887   }
3888 
3889   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3890   Phi->setName("scalar.recur");
3891 
3892   // Finally, fix users of the recurrence outside the loop. The users will need
3893   // either the last value of the scalar recurrence or the last value of the
3894   // vector recurrence we extracted in the middle block. Since the loop is in
3895   // LCSSA form, we just need to find all the phi nodes for the original scalar
3896   // recurrence in the exit block, and then add an edge for the middle block.
3897   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3898     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3899       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3900     }
3901   }
3902 }
3903 
3904 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3905   Constant *Zero = Builder.getInt32(0);
3906 
3907   // Get its reduction variable descriptor.
3908   assert(Legal->isReductionVariable(Phi) &&
3909          "Unable to find the reduction variable");
3910   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3911 
3912   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3913   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3914   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3915   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3916       RdxDesc.getMinMaxRecurrenceKind();
3917   setDebugLocFromInst(Builder, ReductionStartValue);
3918   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3919 
3920   // We need to generate a reduction vector from the incoming scalar.
3921   // To do so, we need to generate the 'identity' vector and override
3922   // one of the elements with the incoming scalar reduction. We need
3923   // to do it in the vector-loop preheader.
3924   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3925 
3926   // This is the vector-clone of the value that leaves the loop.
3927   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3928 
3929   // Find the reduction identity value: zero for add, or and xor; one for
3930   // mul; all-ones (-1) for and.
3931   Value *Identity;
3932   Value *VectorStart;
3933   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3934       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3935     // MinMax reductions have the start value as their identity.
3936 if (VF == 1 || IsInLoopReductionPhi) { 3937 VectorStart = Identity = ReductionStartValue; 3938 } else { 3939 VectorStart = Identity = 3940 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3941 } 3942 } else { 3943 // Handle other reduction kinds: 3944 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3945 RK, VecTy->getScalarType()); 3946 if (VF == 1 || IsInLoopReductionPhi) { 3947 Identity = Iden; 3948 // This vector is the Identity vector where the first element is the 3949 // incoming scalar reduction. 3950 VectorStart = ReductionStartValue; 3951 } else { 3952 Identity = ConstantVector::getSplat(VF, Iden); 3953 3954 // This vector is the Identity vector where the first element is the 3955 // incoming scalar reduction. 3956 VectorStart = 3957 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3958 } 3959 } 3960 3961 // Wrap flags are in general invalid after vectorization, clear them. 3962 clearReductionWrapFlags(RdxDesc); 3963 3964 // Fix the vector-loop phi. 3965 3966 // Reductions do not have to start at zero. They can start with 3967 // any loop invariant values. 3968 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3969 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3970 3971 for (unsigned Part = 0; Part < UF; ++Part) { 3972 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3973 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3974 // Make sure to add the reduction start value only to the 3975 // first unroll part. 3976 Value *StartVal = (Part == 0) ? VectorStart : Identity; 3977 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3978 cast<PHINode>(VecRdxPhi) 3979 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3980 } 3981 3982 // Before each round, move the insertion point right between 3983 // the PHIs and the values we are going to write. 3984 // This allows us to write both PHINodes and the extractelement 3985 // instructions. 3986 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3987 3988 setDebugLocFromInst(Builder, LoopExitInst); 3989 3990 // If tail is folded by masking, the vector value to leave the loop should be 3991 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3992 // instead of the former. 3993 if (Cost->foldTailByMasking()) { 3994 for (unsigned Part = 0; Part < UF; ++Part) { 3995 Value *VecLoopExitInst = 3996 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3997 Value *Sel = nullptr; 3998 for (User *U : VecLoopExitInst->users()) { 3999 if (isa<SelectInst>(U)) { 4000 assert(!Sel && "Reduction exit feeding two selects"); 4001 Sel = U; 4002 } else 4003 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4004 } 4005 assert(Sel && "Reduction exit feeds no select"); 4006 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4007 4008 // If the target can create a predicated operator for the reduction at no 4009 // extra cost in the loop (for example a predicated vadd), it can be 4010 // cheaper for the select to remain in the loop than be sunk out of it, 4011 // and so use the select value for the phi instead of the old 4012 // LoopExitValue. 
4013 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4014 if (PreferPredicatedReductionSelect || 4015 TTI->preferPredicatedReductionSelect( 4016 RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()), 4017 Phi->getType(), TargetTransformInfo::ReductionFlags())) { 4018 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4019 VecRdxPhi->setIncomingValueForBlock( 4020 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4021 } 4022 } 4023 } 4024 4025 // If the vector reduction can be performed in a smaller type, we truncate 4026 // then extend the loop exit value to enable InstCombine to evaluate the 4027 // entire expression in the smaller type. 4028 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4029 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4030 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4031 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4032 Builder.SetInsertPoint( 4033 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4034 VectorParts RdxParts(UF); 4035 for (unsigned Part = 0; Part < UF; ++Part) { 4036 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4037 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4038 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4039 : Builder.CreateZExt(Trunc, VecTy); 4040 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4041 UI != RdxParts[Part]->user_end();) 4042 if (*UI != Trunc) { 4043 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4044 RdxParts[Part] = Extnd; 4045 } else { 4046 ++UI; 4047 } 4048 } 4049 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4050 for (unsigned Part = 0; Part < UF; ++Part) { 4051 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4052 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4053 } 4054 } 4055 4056 // Reduce all of the unrolled parts into a single vector. 4057 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4058 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 4059 4060 // The middle block terminator has already been assigned a DebugLoc here (the 4061 // OrigLoop's single latch terminator). We want the whole middle block to 4062 // appear to execute on this line because: (a) it is all compiler generated, 4063 // (b) these instructions are always executed after evaluating the latch 4064 // conditional branch, and (c) other passes may add new predecessors which 4065 // terminate on this line. This is the easiest way to ensure we don't 4066 // accidentally cause an extra step back into the loop while debugging. 4067 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4068 for (unsigned Part = 1; Part < UF; ++Part) { 4069 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4070 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4071 // Floating point operations had to be 'fast' to enable the reduction. 4072 ReducedPartRdx = addFastMathFlag( 4073 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4074 ReducedPartRdx, "bin.rdx"), 4075 RdxDesc.getFastMathFlags()); 4076 else 4077 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 4078 RdxPart); 4079 } 4080 4081 // Create the reduction after the loop. Note that inloop reductions create the 4082 // target reduction in the loop using a Reduction recipe. 
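// Illustrative sketch (assuming UF = 2, VF = 4, an integer add reduction;
// names invented): after the part-combining loop above, the middle block
// holds
//   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
// and the call below then emits a horizontal reduction of %bin.rdx (an
// llvm.vector.reduce.add-style intrinsic or a target-specific sequence),
// possibly followed by a sign/zero extension if the recurrence was computed
// in a narrower type.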
4083 if (VF.isVector() && !IsInLoopReductionPhi) { 4084 bool NoNaN = Legal->hasFunNoNaNAttr(); 4085 ReducedPartRdx = 4086 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 4087 // If the reduction can be performed in a smaller type, we need to extend 4088 // the reduction to the wider type before we branch to the original loop. 4089 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4090 ReducedPartRdx = 4091 RdxDesc.isSigned() 4092 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4093 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4094 } 4095 4096 // Create a phi node that merges control-flow from the backedge-taken check 4097 // block and the middle block. 4098 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4099 LoopScalarPreHeader->getTerminator()); 4100 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4101 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4102 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4103 4104 // Now, we need to fix the users of the reduction variable 4105 // inside and outside of the scalar remainder loop. 4106 // We know that the loop is in LCSSA form. We need to update the 4107 // PHI nodes in the exit blocks. 4108 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4109 // All PHINodes need to have a single entry edge, or two if 4110 // we already fixed them. 4111 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4112 4113 // We found a reduction value exit-PHI. Update it with the 4114 // incoming bypass edge. 4115 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4116 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4117 } // end of the LCSSA phi scan. 4118 4119 // Fix the scalar loop reduction variable with the incoming reduction sum 4120 // from the vector body and from the backedge value. 4121 int IncomingEdgeBlockIdx = 4122 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4123 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4124 // Pick the other block. 4125 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4126 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4127 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4128 } 4129 4130 void InnerLoopVectorizer::clearReductionWrapFlags( 4131 RecurrenceDescriptor &RdxDesc) { 4132 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 4133 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 4134 RK != RecurrenceDescriptor::RK_IntegerMult) 4135 return; 4136 4137 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4138 assert(LoopExitInstr && "null loop exit instruction"); 4139 SmallVector<Instruction *, 8> Worklist; 4140 SmallPtrSet<Instruction *, 8> Visited; 4141 Worklist.push_back(LoopExitInstr); 4142 Visited.insert(LoopExitInstr); 4143 4144 while (!Worklist.empty()) { 4145 Instruction *Cur = Worklist.pop_back_val(); 4146 if (isa<OverflowingBinaryOperator>(Cur)) 4147 for (unsigned Part = 0; Part < UF; ++Part) { 4148 Value *V = getOrCreateVectorValue(Cur, Part); 4149 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4150 } 4151 4152 for (User *U : Cur->users()) { 4153 Instruction *UI = cast<Instruction>(U); 4154 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4155 Visited.insert(UI).second) 4156 Worklist.push_back(UI); 4157 } 4158 } 4159 } 4160 4161 void InnerLoopVectorizer::fixLCSSAPHIs() { 4162 assert(!VF.isScalable() && "the code below assumes fixed width vectors"); 4163 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4164 if (LCSSAPhi.getNumIncomingValues() == 1) { 4165 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4166 // Non-instruction incoming values will have only one value. 4167 unsigned LastLane = 0; 4168 if (isa<Instruction>(IncomingValue)) 4169 LastLane = Cost->isUniformAfterVectorization( 4170 cast<Instruction>(IncomingValue), VF) 4171 ? 0 4172 : VF.getKnownMinValue() - 1; 4173 // Can be a loop invariant incoming value or the last scalar value to be 4174 // extracted from the vectorized loop. 4175 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4176 Value *lastIncomingValue = 4177 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4178 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4179 } 4180 } 4181 } 4182 4183 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4184 // The basic block and loop containing the predicated instruction. 4185 auto *PredBB = PredInst->getParent(); 4186 auto *VectorLoop = LI->getLoopFor(PredBB); 4187 4188 // Initialize a worklist with the operands of the predicated instruction. 4189 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4190 4191 // Holds instructions that we need to analyze again. An instruction may be 4192 // reanalyzed if we don't yet know if we can sink it or not. 4193 SmallVector<Instruction *, 8> InstsToReanalyze; 4194 4195 // Returns true if a given use occurs in the predicated block. Phi nodes use 4196 // their operands in their corresponding predecessor blocks. 4197 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4198 auto *I = cast<Instruction>(U.getUser()); 4199 BasicBlock *BB = I->getParent(); 4200 if (auto *Phi = dyn_cast<PHINode>(I)) 4201 BB = Phi->getIncomingBlock( 4202 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4203 return BB == PredBB; 4204 }; 4205 4206 // Iteratively sink the scalarized operands of the predicated instruction 4207 // into the block we created for it. When an instruction is sunk, it's 4208 // operands are then added to the worklist. 
The algorithm ends after one pass 4209 // through the worklist doesn't sink a single instruction. 4210 bool Changed; 4211 do { 4212 // Add the instructions that need to be reanalyzed to the worklist, and 4213 // reset the changed indicator. 4214 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4215 InstsToReanalyze.clear(); 4216 Changed = false; 4217 4218 while (!Worklist.empty()) { 4219 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4220 4221 // We can't sink an instruction if it is a phi node, is already in the 4222 // predicated block, is not in the loop, or may have side effects. 4223 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4224 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4225 continue; 4226 4227 // It's legal to sink the instruction if all its uses occur in the 4228 // predicated block. Otherwise, there's nothing to do yet, and we may 4229 // need to reanalyze the instruction. 4230 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4231 InstsToReanalyze.push_back(I); 4232 continue; 4233 } 4234 4235 // Move the instruction to the beginning of the predicated block, and add 4236 // it's operands to the worklist. 4237 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4238 Worklist.insert(I->op_begin(), I->op_end()); 4239 4240 // The sinking may have enabled other instructions to be sunk, so we will 4241 // need to iterate. 4242 Changed = true; 4243 } 4244 } while (Changed); 4245 } 4246 4247 void InnerLoopVectorizer::fixNonInductionPHIs() { 4248 for (PHINode *OrigPhi : OrigPHIsToFix) { 4249 PHINode *NewPhi = 4250 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4251 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4252 4253 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4254 predecessors(OrigPhi->getParent())); 4255 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4256 predecessors(NewPhi->getParent())); 4257 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4258 "Scalar and Vector BB should have the same number of predecessors"); 4259 4260 // The insertion point in Builder may be invalidated by the time we get 4261 // here. Force the Builder insertion point to something valid so that we do 4262 // not run into issues during insertion point restore in 4263 // getOrCreateVectorValue calls below. 4264 Builder.SetInsertPoint(NewPhi); 4265 4266 // The predecessor order is preserved and we can rely on mapping between 4267 // scalar and vector block predecessors. 4268 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4269 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4270 4271 // When looking up the new scalar/vector values to fix up, use incoming 4272 // values from original phi. 4273 Value *ScIncV = 4274 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4275 4276 // Scalar incoming value may need a broadcast 4277 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4278 NewPhi->addIncoming(NewIncV, NewPredBB); 4279 } 4280 } 4281 } 4282 4283 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, 4284 unsigned UF, ElementCount VF, 4285 bool IsPtrLoopInvariant, 4286 SmallBitVector &IsIndexLoopInvariant, 4287 VPTransformState &State) { 4288 // Construct a vector GEP by widening the operands of the scalar GEP as 4289 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4290 // results in a vector of pointers when at least one operand of the GEP 4291 // is vector-typed. 
Thus, to keep the representation compact, we only use 4292 // vector-typed operands for loop-varying values. 4293 4294 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4295 // If we are vectorizing, but the GEP has only loop-invariant operands, 4296 // the GEP we build (by only using vector-typed operands for 4297 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4298 // produce a vector of pointers, we need to either arbitrarily pick an 4299 // operand to broadcast, or broadcast a clone of the original GEP. 4300 // Here, we broadcast a clone of the original. 4301 // 4302 // TODO: If at some point we decide to scalarize instructions having 4303 // loop-invariant operands, this special case will no longer be 4304 // required. We would add the scalarization decision to 4305 // collectLoopScalars() and teach getVectorValue() to broadcast 4306 // the lane-zero scalar value. 4307 auto *Clone = Builder.Insert(GEP->clone()); 4308 for (unsigned Part = 0; Part < UF; ++Part) { 4309 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4310 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4311 addMetadata(EntryPart, GEP); 4312 } 4313 } else { 4314 // If the GEP has at least one loop-varying operand, we are sure to 4315 // produce a vector of pointers. But if we are only unrolling, we want 4316 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4317 // produce with the code below will be scalar (if VF == 1) or vector 4318 // (otherwise). Note that for the unroll-only case, we still maintain 4319 // values in the vector mapping with initVector, as we do for other 4320 // instructions. 4321 for (unsigned Part = 0; Part < UF; ++Part) { 4322 // The pointer operand of the new GEP. If it's loop-invariant, we 4323 // won't broadcast it. 4324 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4325 : State.get(Operands.getOperand(0), Part); 4326 4327 // Collect all the indices for the new GEP. If any index is 4328 // loop-invariant, we won't broadcast it. 4329 SmallVector<Value *, 4> Indices; 4330 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4331 VPValue *Operand = Operands.getOperand(I); 4332 if (IsIndexLoopInvariant[I - 1]) 4333 Indices.push_back(State.get(Operand, {0, 0})); 4334 else 4335 Indices.push_back(State.get(Operand, Part)); 4336 } 4337 4338 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4339 // but it should be a vector, otherwise. 4340 auto *NewGEP = 4341 GEP->isInBounds() 4342 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4343 Indices) 4344 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4345 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4346 "NewGEP is not a pointer vector"); 4347 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4348 addMetadata(NewGEP, GEP); 4349 } 4350 } 4351 } 4352 4353 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4354 ElementCount VF) { 4355 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4356 PHINode *P = cast<PHINode>(PN); 4357 if (EnableVPlanNativePath) { 4358 // Currently we enter here in the VPlan-native path for non-induction 4359 // PHIs where all control flow is uniform. We simply widen these PHIs. 4360 // Create a vector phi with no operands - the vector phi operands will be 4361 // set at the end of vector code generation. 4362 Type *VecTy = 4363 (VF.isScalar()) ? 
PN->getType() : VectorType::get(PN->getType(), VF); 4364 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4365 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4366 OrigPHIsToFix.push_back(P); 4367 4368 return; 4369 } 4370 4371 assert(PN->getParent() == OrigLoop->getHeader() && 4372 "Non-header phis should have been handled elsewhere"); 4373 4374 // In order to support recurrences we need to be able to vectorize Phi nodes. 4375 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4376 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4377 // this value when we vectorize all of the instructions that use the PHI. 4378 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4379 for (unsigned Part = 0; Part < UF; ++Part) { 4380 // This is phase one of vectorizing PHIs. 4381 bool ScalarPHI = 4382 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4383 Type *VecTy = 4384 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4385 Value *EntryPart = PHINode::Create( 4386 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4387 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4388 } 4389 return; 4390 } 4391 4392 setDebugLocFromInst(Builder, P); 4393 4394 // This PHINode must be an induction variable. 4395 // Make sure that we know about it. 4396 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4397 4398 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4399 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4400 4401 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4402 // which can be found from the original scalar operations. 4403 switch (II.getKind()) { 4404 case InductionDescriptor::IK_NoInduction: 4405 llvm_unreachable("Unknown induction"); 4406 case InductionDescriptor::IK_IntInduction: 4407 case InductionDescriptor::IK_FpInduction: 4408 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4409 case InductionDescriptor::IK_PtrInduction: { 4410 // Handle the pointer induction variable case. 4411 assert(P->getType()->isPointerTy() && "Unexpected type."); 4412 4413 if (Cost->isScalarAfterVectorization(P, VF)) { 4414 // This is the normalized GEP that starts counting at zero. 4415 Value *PtrInd = 4416 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4417 // Determine the number of scalars we need to generate for each unroll 4418 // iteration. If the instruction is uniform, we only need to generate the 4419 // first lane. Otherwise, we generate all VF values. 4420 unsigned Lanes = 4421 Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.getKnownMinValue(); 4422 for (unsigned Part = 0; Part < UF; ++Part) { 4423 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4424 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4425 Lane + Part * VF.getKnownMinValue()); 4426 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4427 Value *SclrGep = 4428 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4429 SclrGep->setName("next.gep"); 4430 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4431 } 4432 } 4433 return; 4434 } 4435 assert(isa<SCEVConstant>(II.getStep()) && 4436 "Induction step not a SCEV constant!"); 4437 Type *PhiType = II.getStep()->getType(); 4438 4439 // Build a pointer phi 4440 Value *ScalarStartValue = II.getStartValue(); 4441 Type *ScStValueType = ScalarStartValue->getType(); 4442 PHINode *NewPointerPhi = 4443 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4444 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4445 4446 // A pointer induction, performed by using a gep 4447 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4448 Instruction *InductionLoc = LoopLatch->getTerminator(); 4449 const SCEV *ScalarStep = II.getStep(); 4450 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4451 Value *ScalarStepValue = 4452 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4453 Value *InductionGEP = GetElementPtrInst::Create( 4454 ScStValueType->getPointerElementType(), NewPointerPhi, 4455 Builder.CreateMul( 4456 ScalarStepValue, 4457 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4458 "ptr.ind", InductionLoc); 4459 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4460 4461 // Create UF many actual address geps that use the pointer 4462 // phi as base and a vectorized version of the step value 4463 // (<step*0, ..., step*N>) as offset. 4464 for (unsigned Part = 0; Part < UF; ++Part) { 4465 SmallVector<Constant *, 8> Indices; 4466 // Create a vector of consecutive numbers from zero to VF. 4467 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4468 Indices.push_back( 4469 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4470 Constant *StartOffset = ConstantVector::get(Indices); 4471 4472 Value *GEP = Builder.CreateGEP( 4473 ScStValueType->getPointerElementType(), NewPointerPhi, 4474 Builder.CreateMul( 4475 StartOffset, 4476 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4477 "vector.gep")); 4478 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4479 } 4480 } 4481 } 4482 } 4483 4484 /// A helper function for checking whether an integer division-related 4485 /// instruction may divide by zero (in which case it must be predicated if 4486 /// executed conditionally in the scalar code). 4487 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4488 /// Non-zero divisors that are non compile-time constants will not be 4489 /// converted into multiplication, so we will still end up scalarizing 4490 /// the division, but can do so w/o predication. 
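/// For example (illustrative): 'udiv i32 %a, 7' has a known non-zero
/// constant divisor and needs no predication, whereas 'udiv i32 %a, %b' may
/// divide by zero and must be predicated if it only executes conditionally
/// in the scalar loop.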
4491 static bool mayDivideByZero(Instruction &I) { 4492 assert((I.getOpcode() == Instruction::UDiv || 4493 I.getOpcode() == Instruction::SDiv || 4494 I.getOpcode() == Instruction::URem || 4495 I.getOpcode() == Instruction::SRem) && 4496 "Unexpected instruction"); 4497 Value *Divisor = I.getOperand(1); 4498 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4499 return !CInt || CInt->isZero(); 4500 } 4501 4502 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, 4503 VPTransformState &State) { 4504 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4505 switch (I.getOpcode()) { 4506 case Instruction::Call: 4507 case Instruction::Br: 4508 case Instruction::PHI: 4509 case Instruction::GetElementPtr: 4510 case Instruction::Select: 4511 llvm_unreachable("This instruction is handled by a different recipe."); 4512 case Instruction::UDiv: 4513 case Instruction::SDiv: 4514 case Instruction::SRem: 4515 case Instruction::URem: 4516 case Instruction::Add: 4517 case Instruction::FAdd: 4518 case Instruction::Sub: 4519 case Instruction::FSub: 4520 case Instruction::FNeg: 4521 case Instruction::Mul: 4522 case Instruction::FMul: 4523 case Instruction::FDiv: 4524 case Instruction::FRem: 4525 case Instruction::Shl: 4526 case Instruction::LShr: 4527 case Instruction::AShr: 4528 case Instruction::And: 4529 case Instruction::Or: 4530 case Instruction::Xor: { 4531 // Just widen unops and binops. 4532 setDebugLocFromInst(Builder, &I); 4533 4534 for (unsigned Part = 0; Part < UF; ++Part) { 4535 SmallVector<Value *, 2> Ops; 4536 for (VPValue *VPOp : User.operands()) 4537 Ops.push_back(State.get(VPOp, Part)); 4538 4539 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4540 4541 if (auto *VecOp = dyn_cast<Instruction>(V)) 4542 VecOp->copyIRFlags(&I); 4543 4544 // Use this vector value for all users of the original instruction. 4545 VectorLoopValueMap.setVectorValue(&I, Part, V); 4546 addMetadata(V, &I); 4547 } 4548 4549 break; 4550 } 4551 case Instruction::ICmp: 4552 case Instruction::FCmp: { 4553 // Widen compares. Generate vector compares. 4554 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4555 auto *Cmp = cast<CmpInst>(&I); 4556 setDebugLocFromInst(Builder, Cmp); 4557 for (unsigned Part = 0; Part < UF; ++Part) { 4558 Value *A = State.get(User.getOperand(0), Part); 4559 Value *B = State.get(User.getOperand(1), Part); 4560 Value *C = nullptr; 4561 if (FCmp) { 4562 // Propagate fast math flags. 4563 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4564 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4565 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4566 } else { 4567 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4568 } 4569 VectorLoopValueMap.setVectorValue(&I, Part, C); 4570 addMetadata(C, &I); 4571 } 4572 4573 break; 4574 } 4575 4576 case Instruction::ZExt: 4577 case Instruction::SExt: 4578 case Instruction::FPToUI: 4579 case Instruction::FPToSI: 4580 case Instruction::FPExt: 4581 case Instruction::PtrToInt: 4582 case Instruction::IntToPtr: 4583 case Instruction::SIToFP: 4584 case Instruction::UIToFP: 4585 case Instruction::Trunc: 4586 case Instruction::FPTrunc: 4587 case Instruction::BitCast: { 4588 auto *CI = cast<CastInst>(&I); 4589 setDebugLocFromInst(Builder, CI); 4590 4591 /// Vectorize casts. 4592 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4593 Type *DestTy = 4594 (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4595 4596 for (unsigned Part = 0; Part < UF; ++Part) { 4597 Value *A = State.get(User.getOperand(0), Part); 4598 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4599 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4600 addMetadata(Cast, &I); 4601 } 4602 break; 4603 } 4604 default: 4605 // This instruction is not vectorized by simple widening. 4606 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4607 llvm_unreachable("Unhandled instruction!"); 4608 } // end of switch. 4609 } 4610 4611 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4612 VPTransformState &State) { 4613 assert(!isa<DbgInfoIntrinsic>(I) && 4614 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4615 setDebugLocFromInst(Builder, &I); 4616 4617 Module *M = I.getParent()->getParent()->getParent(); 4618 auto *CI = cast<CallInst>(&I); 4619 4620 SmallVector<Type *, 4> Tys; 4621 for (Value *ArgOperand : CI->arg_operands()) 4622 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4623 4624 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4625 4626 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4627 // version of the instruction. 4628 // Is it beneficial to perform intrinsic call compared to lib call? 4629 bool NeedToScalarize = false; 4630 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4631 bool UseVectorIntrinsic = 4632 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4633 assert((UseVectorIntrinsic || !NeedToScalarize) && 4634 "Instruction should be scalarized elsewhere."); 4635 4636 for (unsigned Part = 0; Part < UF; ++Part) { 4637 SmallVector<Value *, 4> Args; 4638 for (auto &I : enumerate(ArgOperands.operands())) { 4639 // Some intrinsics have a scalar argument - don't replace it with a 4640 // vector. 4641 Value *Arg; 4642 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4643 Arg = State.get(I.value(), Part); 4644 else 4645 Arg = State.get(I.value(), {0, 0}); 4646 Args.push_back(Arg); 4647 } 4648 4649 Function *VectorF; 4650 if (UseVectorIntrinsic) { 4651 // Use vector version of the intrinsic. 4652 Type *TysForDecl[] = {CI->getType()}; 4653 if (VF.isVector()) { 4654 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4655 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4656 } 4657 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4658 assert(VectorF && "Can't retrieve vector intrinsic."); 4659 } else { 4660 // Use vector version of the function call. 4661 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4662 #ifndef NDEBUG 4663 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4664 "Can't create vector function."); 4665 #endif 4666 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4667 } 4668 SmallVector<OperandBundleDef, 1> OpBundles; 4669 CI->getOperandBundlesAsDefs(OpBundles); 4670 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4671 4672 if (isa<FPMathOperator>(V)) 4673 V->copyFastMathFlags(CI); 4674 4675 VectorLoopValueMap.setVectorValue(&I, Part, V); 4676 addMetadata(V, &I); 4677 } 4678 } 4679 4680 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4681 VPUser &Operands, 4682 bool InvariantCond, 4683 VPTransformState &State) { 4684 setDebugLocFromInst(Builder, &I); 4685 4686 // The condition can be loop invariant but still defined inside the 4687 // loop. 
This means that we can't just use the original 'cond' value. 4688 // We have to take the 'vectorized' value and pick the first lane. 4689 // Instcombine will make this a no-op. 4690 auto *InvarCond = 4691 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4692 4693 for (unsigned Part = 0; Part < UF; ++Part) { 4694 Value *Cond = 4695 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4696 Value *Op0 = State.get(Operands.getOperand(1), Part); 4697 Value *Op1 = State.get(Operands.getOperand(2), Part); 4698 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4699 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4700 addMetadata(Sel, &I); 4701 } 4702 } 4703 4704 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4705 // We should not collect Scalars more than once per VF. Right now, this 4706 // function is called from collectUniformsAndScalars(), which already does 4707 // this check. Collecting Scalars for VF=1 does not make any sense. 4708 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4709 "This function should not be visited twice for the same VF"); 4710 4711 SmallSetVector<Instruction *, 8> Worklist; 4712 4713 // These sets are used to seed the analysis with pointers used by memory 4714 // accesses that will remain scalar. 4715 SmallSetVector<Instruction *, 8> ScalarPtrs; 4716 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4717 auto *Latch = TheLoop->getLoopLatch(); 4718 4719 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4720 // The pointer operands of loads and stores will be scalar as long as the 4721 // memory access is not a gather or scatter operation. The value operand of a 4722 // store will remain scalar if the store is scalarized. 4723 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4724 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4725 assert(WideningDecision != CM_Unknown && 4726 "Widening decision should be ready at this moment"); 4727 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4728 if (Ptr == Store->getValueOperand()) 4729 return WideningDecision == CM_Scalarize; 4730 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4731 "Ptr is neither a value or pointer operand"); 4732 return WideningDecision != CM_GatherScatter; 4733 }; 4734 4735 // A helper that returns true if the given value is a bitcast or 4736 // getelementptr instruction contained in the loop. 4737 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4738 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4739 isa<GetElementPtrInst>(V)) && 4740 !TheLoop->isLoopInvariant(V); 4741 }; 4742 4743 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4744 if (!isa<PHINode>(Ptr) || 4745 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4746 return false; 4747 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4748 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4749 return false; 4750 return isScalarUse(MemAccess, Ptr); 4751 }; 4752 4753 // A helper that evaluates a memory access's use of a pointer. If the 4754 // pointer is actually the pointer induction of a loop, it is being 4755 // inserted into Worklist. If the use will be a scalar use, and the 4756 // pointer is only used by memory accesses, we place the pointer in 4757 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
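// For instance (an illustrative sketch), given
//   %gep = getelementptr inbounds i32, i32* %a, i64 %i
//   store i32 %x, i32* %gep
// where the store is not a scatter, the use of %gep is a scalar use and
// %gep is used only by memory accesses, so it is placed in ScalarPtrs. A
// pointer that also feeds a non-memory user, or whose access becomes a
// gather/scatter, goes into PossibleNonScalarPtrs instead.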
4758 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4759 if (isScalarPtrInduction(MemAccess, Ptr)) {
4760 Worklist.insert(cast<Instruction>(Ptr));
4761 Instruction *Update = cast<Instruction>(
4762 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4763 Worklist.insert(Update);
4764 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4765 << "\n");
4766 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4767 << "\n");
4768 return;
4769 }
4770 // We only care about bitcast and getelementptr instructions contained in
4771 // the loop.
4772 if (!isLoopVaryingBitCastOrGEP(Ptr))
4773 return;
4774
4775 // If the pointer has already been identified as scalar (e.g., if it was
4776 // also identified as uniform), there's nothing to do.
4777 auto *I = cast<Instruction>(Ptr);
4778 if (Worklist.count(I))
4779 return;
4780
4781 // If the use of the pointer will be a scalar use, and all users of the
4782 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4783 // place the pointer in PossibleNonScalarPtrs.
4784 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4785 return isa<LoadInst>(U) || isa<StoreInst>(U);
4786 }))
4787 ScalarPtrs.insert(I);
4788 else
4789 PossibleNonScalarPtrs.insert(I);
4790 };
4791
4792 // We seed the scalars analysis with two classes of instructions: (1)
4793 // instructions marked uniform-after-vectorization and (2) bitcast,
4794 // getelementptr and (pointer) phi instructions used by memory accesses
4795 // requiring a scalar use.
4796 //
4797 // (1) Add to the worklist all instructions that have been identified as
4798 // uniform-after-vectorization.
4799 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4800
4801 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4802 // memory accesses requiring a scalar use. The pointer operands of loads and
4803 // stores will be scalar as long as the memory access is not a gather or
4804 // scatter operation. The value operand of a store will remain scalar if the
4805 // store is scalarized.
4806 for (auto *BB : TheLoop->blocks())
4807 for (auto &I : *BB) {
4808 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4809 evaluatePtrUse(Load, Load->getPointerOperand());
4810 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4811 evaluatePtrUse(Store, Store->getPointerOperand());
4812 evaluatePtrUse(Store, Store->getValueOperand());
4813 }
4814 }
4815 for (auto *I : ScalarPtrs)
4816 if (!PossibleNonScalarPtrs.count(I)) {
4817 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4818 Worklist.insert(I);
4819 }
4820
4821 // Insert the forced scalars.
4822 // FIXME: Currently widenPHIInstruction() often creates a dead vector
4823 // induction variable when the PHI user is scalarized.
4824 auto ForcedScalar = ForcedScalars.find(VF);
4825 if (ForcedScalar != ForcedScalars.end())
4826 for (auto *I : ForcedScalar->second)
4827 Worklist.insert(I);
4828
4829 // Expand the worklist by looking through any bitcasts and getelementptr
4830 // instructions we've already identified as scalar. This is similar to the
4831 // expansion step in collectLoopUniforms(); however, here we're only
4832 // expanding to include additional bitcasts and getelementptr instructions.
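// Illustrative sketch: if the worklist already holds
//   %bc  = bitcast float* %gep to i32*
// as a scalar pointer, the loop below inspects its operand
//   %gep = getelementptr inbounds float, float* %a, i64 %i
// and marks %gep scalar as well, provided every in-loop user of %gep is
// either already in the worklist or a memory access whose use of it is
// scalar.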
4833 unsigned Idx = 0; 4834 while (Idx != Worklist.size()) { 4835 Instruction *Dst = Worklist[Idx++]; 4836 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4837 continue; 4838 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4839 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4840 auto *J = cast<Instruction>(U); 4841 return !TheLoop->contains(J) || Worklist.count(J) || 4842 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4843 isScalarUse(J, Src)); 4844 })) { 4845 Worklist.insert(Src); 4846 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4847 } 4848 } 4849 4850 // An induction variable will remain scalar if all users of the induction 4851 // variable and induction variable update remain scalar. 4852 for (auto &Induction : Legal->getInductionVars()) { 4853 auto *Ind = Induction.first; 4854 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4855 4856 // If tail-folding is applied, the primary induction variable will be used 4857 // to feed a vector compare. 4858 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4859 continue; 4860 4861 // Determine if all users of the induction variable are scalar after 4862 // vectorization. 4863 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4864 auto *I = cast<Instruction>(U); 4865 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4866 }); 4867 if (!ScalarInd) 4868 continue; 4869 4870 // Determine if all users of the induction variable update instruction are 4871 // scalar after vectorization. 4872 auto ScalarIndUpdate = 4873 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4874 auto *I = cast<Instruction>(U); 4875 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4876 }); 4877 if (!ScalarIndUpdate) 4878 continue; 4879 4880 // The induction variable and its update instruction will remain scalar. 4881 Worklist.insert(Ind); 4882 Worklist.insert(IndUpdate); 4883 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4884 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4885 << "\n"); 4886 } 4887 4888 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4889 } 4890 4891 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 4892 ElementCount VF) { 4893 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4894 if (!blockNeedsPredication(I->getParent())) 4895 return false; 4896 switch(I->getOpcode()) { 4897 default: 4898 break; 4899 case Instruction::Load: 4900 case Instruction::Store: { 4901 if (!Legal->isMaskRequired(I)) 4902 return false; 4903 auto *Ptr = getLoadStorePointerOperand(I); 4904 auto *Ty = getMemInstValueType(I); 4905 // We have already decided how to vectorize this instruction, get that 4906 // result. 4907 if (VF.isVector()) { 4908 InstWidening WideningDecision = getWideningDecision(I, VF); 4909 assert(WideningDecision != CM_Unknown && 4910 "Widening decision should be ready at this moment"); 4911 return WideningDecision == CM_Scalarize; 4912 } 4913 const Align Alignment = getLoadStoreAlignment(I); 4914 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4915 isLegalMaskedGather(Ty, Alignment)) 4916 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4917 isLegalMaskedScatter(Ty, Alignment)); 4918 } 4919 case Instruction::UDiv: 4920 case Instruction::SDiv: 4921 case Instruction::SRem: 4922 case Instruction::URem: 4923 return mayDivideByZero(*I); 4924 } 4925 return false; 4926 } 4927 4928 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4929 Instruction *I, ElementCount VF) { 4930 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4931 assert(getWideningDecision(I, VF) == CM_Unknown && 4932 "Decision should not be set yet."); 4933 auto *Group = getInterleavedAccessGroup(I); 4934 assert(Group && "Must have a group."); 4935 4936 // If the instruction's allocated size doesn't equal it's type size, it 4937 // requires padding and will be scalarized. 4938 auto &DL = I->getModule()->getDataLayout(); 4939 auto *ScalarTy = getMemInstValueType(I); 4940 if (hasIrregularType(ScalarTy, DL, VF)) 4941 return false; 4942 4943 // Check if masking is required. 4944 // A Group may need masking for one of two reasons: it resides in a block that 4945 // needs predication, or it was decided to use masking to deal with gaps. 4946 bool PredicatedAccessRequiresMasking = 4947 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4948 bool AccessWithGapsRequiresMasking = 4949 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4950 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4951 return true; 4952 4953 // If masked interleaving is required, we expect that the user/target had 4954 // enabled it, because otherwise it either wouldn't have been created or 4955 // it should have been invalidated by the CostModel. 4956 assert(useMaskedInterleavedAccesses(TTI) && 4957 "Masked interleave-groups for predicated accesses are not enabled."); 4958 4959 auto *Ty = getMemInstValueType(I); 4960 const Align Alignment = getLoadStoreAlignment(I); 4961 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4962 : TTI.isLegalMaskedStore(Ty, Alignment); 4963 } 4964 4965 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4966 Instruction *I, ElementCount VF) { 4967 // Get and ensure we have a valid memory instruction. 4968 LoadInst *LI = dyn_cast<LoadInst>(I); 4969 StoreInst *SI = dyn_cast<StoreInst>(I); 4970 assert((LI || SI) && "Invalid memory instruction"); 4971 4972 auto *Ptr = getLoadStorePointerOperand(I); 4973 4974 // In order to be widened, the pointer should be consecutive, first of all. 4975 if (!Legal->isConsecutivePtr(Ptr)) 4976 return false; 4977 4978 // If the instruction is a store located in a predicated block, it will be 4979 // scalarized. 4980 if (isScalarWithPredication(I)) 4981 return false; 4982 4983 // If the instruction's allocated size doesn't equal it's type size, it 4984 // requires padding and will be scalarized. 4985 auto &DL = I->getModule()->getDataLayout(); 4986 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4987 if (hasIrregularType(ScalarTy, DL, VF)) 4988 return false; 4989 4990 return true; 4991 } 4992 4993 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4994 // We should not collect Uniforms more than once per VF. Right now, 4995 // this function is called from collectUniformsAndScalars(), which 4996 // already does this check. Collecting Uniforms for VF=1 does not make any 4997 // sense. 
4998 4999 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5000 "This function should not be visited twice for the same VF"); 5001 5002 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5003 // not analyze again. Uniforms.count(VF) will return 1. 5004 Uniforms[VF].clear(); 5005 5006 // We now know that the loop is vectorizable! 5007 // Collect instructions inside the loop that will remain uniform after 5008 // vectorization. 5009 5010 // Global values, params and instructions outside of current loop are out of 5011 // scope. 5012 auto isOutOfScope = [&](Value *V) -> bool { 5013 Instruction *I = dyn_cast<Instruction>(V); 5014 return (!I || !TheLoop->contains(I)); 5015 }; 5016 5017 SetVector<Instruction *> Worklist; 5018 BasicBlock *Latch = TheLoop->getLoopLatch(); 5019 5020 // Instructions that are scalar with predication must not be considered 5021 // uniform after vectorization, because that would create an erroneous 5022 // replicating region where only a single instance out of VF should be formed. 5023 // TODO: optimize such seldom cases if found important, see PR40816. 5024 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5025 if (isScalarWithPredication(I, VF)) { 5026 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5027 << *I << "\n"); 5028 return; 5029 } 5030 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5031 Worklist.insert(I); 5032 }; 5033 5034 // Start with the conditional branch. If the branch condition is an 5035 // instruction contained in the loop that is only used by the branch, it is 5036 // uniform. 5037 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5038 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5039 addToWorklistIfAllowed(Cmp); 5040 5041 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 5042 // are pointers that are treated like consecutive pointers during 5043 // vectorization. The pointer operands of interleaved accesses are an 5044 // example. 5045 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 5046 5047 // Holds pointer operands of instructions that are possibly non-uniform. 5048 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 5049 5050 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5051 InstWidening WideningDecision = getWideningDecision(I, VF); 5052 assert(WideningDecision != CM_Unknown && 5053 "Widening decision should be ready at this moment"); 5054 5055 return (WideningDecision == CM_Widen || 5056 WideningDecision == CM_Widen_Reverse || 5057 WideningDecision == CM_Interleave); 5058 }; 5059 // Iterate over the instructions in the loop, and collect all 5060 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 5061 // that a consecutive-like pointer operand will be scalarized, we collect it 5062 // in PossibleNonUniformPtrs instead. We use two sets here because a single 5063 // getelementptr instruction can be used by both vectorized and scalarized 5064 // memory instructions. For example, if a loop loads and stores from the same 5065 // location, but the store is conditional, the store will be scalarized, and 5066 // the getelementptr won't remain uniform. 5067 for (auto *BB : TheLoop->blocks()) 5068 for (auto &I : *BB) { 5069 // If there's no pointer operand, there's nothing to do. 
5070 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 5071 if (!Ptr) 5072 continue; 5073 5074 // True if all users of Ptr are memory accesses that have Ptr as their 5075 // pointer operand. 5076 auto UsersAreMemAccesses = 5077 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 5078 return getLoadStorePointerOperand(U) == Ptr; 5079 }); 5080 5081 // Ensure the memory instruction will not be scalarized or used by 5082 // gather/scatter, making its pointer operand non-uniform. If the pointer 5083 // operand is used by any instruction other than a memory access, we 5084 // conservatively assume the pointer operand may be non-uniform. 5085 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 5086 PossibleNonUniformPtrs.insert(Ptr); 5087 5088 // If the memory instruction will be vectorized and its pointer operand 5089 // is consecutive-like, or interleaving - the pointer operand should 5090 // remain uniform. 5091 else 5092 ConsecutiveLikePtrs.insert(Ptr); 5093 } 5094 5095 // Add to the Worklist all consecutive and consecutive-like pointers that 5096 // aren't also identified as possibly non-uniform. 5097 for (auto *V : ConsecutiveLikePtrs) 5098 if (!PossibleNonUniformPtrs.count(V)) 5099 addToWorklistIfAllowed(V); 5100 5101 // Expand Worklist in topological order: whenever a new instruction 5102 // is added , its users should be already inside Worklist. It ensures 5103 // a uniform instruction will only be used by uniform instructions. 5104 unsigned idx = 0; 5105 while (idx != Worklist.size()) { 5106 Instruction *I = Worklist[idx++]; 5107 5108 for (auto OV : I->operand_values()) { 5109 // isOutOfScope operands cannot be uniform instructions. 5110 if (isOutOfScope(OV)) 5111 continue; 5112 // First order recurrence Phi's should typically be considered 5113 // non-uniform. 5114 auto *OP = dyn_cast<PHINode>(OV); 5115 if (OP && Legal->isFirstOrderRecurrence(OP)) 5116 continue; 5117 // If all the users of the operand are uniform, then add the 5118 // operand into the uniform worklist. 5119 auto *OI = cast<Instruction>(OV); 5120 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5121 auto *J = cast<Instruction>(U); 5122 return Worklist.count(J) || 5123 (OI == getLoadStorePointerOperand(J) && 5124 isUniformDecision(J, VF)); 5125 })) 5126 addToWorklistIfAllowed(OI); 5127 } 5128 } 5129 5130 // Returns true if Ptr is the pointer operand of a memory access instruction 5131 // I, and I is known to not require scalarization. 5132 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5133 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5134 }; 5135 5136 // For an instruction to be added into Worklist above, all its users inside 5137 // the loop should also be in Worklist. However, this condition cannot be 5138 // true for phi nodes that form a cyclic dependence. We must process phi 5139 // nodes separately. An induction variable will remain uniform if all users 5140 // of the induction variable and induction variable update remain uniform. 5141 // The code below handles both pointer and non-pointer induction variables. 5142 for (auto &Induction : Legal->getInductionVars()) { 5143 auto *Ind = Induction.first; 5144 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5145 5146 // Determine if all users of the induction variable are uniform after 5147 // vectorization. 
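// For example (illustrative): in a loop computing 'a[i] = b[i] + 1', the
// induction 'i' and its update feed only address computation for
// consecutive accesses, so both remain uniform. If 'i' were also stored as
// data ('a[i] = i'), the store would need a vector of lane values and 'i'
// could not be treated as uniform.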
5148 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5149 auto *I = cast<Instruction>(U); 5150 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5151 isVectorizedMemAccessUse(I, Ind); 5152 }); 5153 if (!UniformInd) 5154 continue; 5155 5156 // Determine if all users of the induction variable update instruction are 5157 // uniform after vectorization. 5158 auto UniformIndUpdate = 5159 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5160 auto *I = cast<Instruction>(U); 5161 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5162 isVectorizedMemAccessUse(I, IndUpdate); 5163 }); 5164 if (!UniformIndUpdate) 5165 continue; 5166 5167 // The induction variable and its update instruction will remain uniform. 5168 addToWorklistIfAllowed(Ind); 5169 addToWorklistIfAllowed(IndUpdate); 5170 } 5171 5172 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5173 } 5174 5175 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5176 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5177 5178 if (Legal->getRuntimePointerChecking()->Need) { 5179 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5180 "runtime pointer checks needed. Enable vectorization of this " 5181 "loop with '#pragma clang loop vectorize(enable)' when " 5182 "compiling with -Os/-Oz", 5183 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5184 return true; 5185 } 5186 5187 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5188 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5189 "runtime SCEV checks needed. Enable vectorization of this " 5190 "loop with '#pragma clang loop vectorize(enable)' when " 5191 "compiling with -Os/-Oz", 5192 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5193 return true; 5194 } 5195 5196 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5197 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5198 reportVectorizationFailure("Runtime stride check for small trip count", 5199 "runtime stride == 1 checks needed. Enable vectorization of " 5200 "this loop without such check by compiling with -Os/-Oz", 5201 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5202 return true; 5203 } 5204 5205 return false; 5206 } 5207 5208 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, 5209 unsigned UserIC) { 5210 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5211 // TODO: It may by useful to do since it's still likely to be dynamically 5212 // uniform if the target can skip. 5213 reportVectorizationFailure( 5214 "Not inserting runtime ptr check for divergent target", 5215 "runtime pointer checks needed. Not enabled for divergent target", 5216 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5217 return None; 5218 } 5219 5220 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5221 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5222 if (TC == 1) { 5223 reportVectorizationFailure("Single iteration (non) loop", 5224 "loop trip count is one, irrelevant for vectorization", 5225 "SingleIterationLoop", ORE, TheLoop); 5226 return None; 5227 } 5228 5229 switch (ScalarEpilogueStatus) { 5230 case CM_ScalarEpilogueAllowed: 5231 return UserVF ? 
UserVF : computeFeasibleMaxVF(TC); 5232 case CM_ScalarEpilogueNotNeededUsePredicate: 5233 LLVM_DEBUG( 5234 dbgs() << "LV: vector predicate hint/switch found.\n" 5235 << "LV: Not allowing scalar epilogue, creating predicated " 5236 << "vector loop.\n"); 5237 break; 5238 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5239 // fallthrough as a special case of OptForSize 5240 case CM_ScalarEpilogueNotAllowedOptSize: 5241 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5242 LLVM_DEBUG( 5243 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5244 else 5245 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5246 << "count.\n"); 5247 5248 // Bail if runtime checks are required, which are not good when optimising 5249 // for size. 5250 if (runtimeChecksRequired()) 5251 return None; 5252 break; 5253 } 5254 5255 // Now try the tail folding 5256 5257 // Invalidate interleave groups that require an epilogue if we can't mask 5258 // the interleave-group. 5259 if (!useMaskedInterleavedAccesses(TTI)) { 5260 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5261 "No decisions should have been taken at this point"); 5262 // Note: There is no need to invalidate any cost modeling decisions here, as 5263 // non where taken so far. 5264 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5265 } 5266 5267 unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC); 5268 assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2"); 5269 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; 5270 if (TC > 0 && TC % MaxVFtimesIC == 0) { 5271 // Accept MaxVF if we do not have a tail. 5272 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5273 return MaxVF; 5274 } 5275 5276 // If we don't know the precise trip count, or if the trip count that we 5277 // found modulo the vectorization factor is not zero, try to fold the tail 5278 // by masking. 5279 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5280 if (Legal->prepareToFoldTailByMasking()) { 5281 FoldTailByMasking = true; 5282 return MaxVF; 5283 } 5284 5285 // If there was a tail-folding hint/switch, but we can't fold the tail by 5286 // masking, fallback to a vectorization with a scalar epilogue. 5287 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5288 if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) { 5289 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5290 return None; 5291 } 5292 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5293 "scalar epilogue instead.\n"); 5294 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5295 return MaxVF; 5296 } 5297 5298 if (TC == 0) { 5299 reportVectorizationFailure( 5300 "Unable to calculate the loop count due to complex control flow", 5301 "unable to calculate the loop count due to complex control flow", 5302 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5303 return None; 5304 } 5305 5306 reportVectorizationFailure( 5307 "Cannot optimize for size and vectorize at the same time.", 5308 "cannot optimize for size and vectorize at the same time. 
" 5309 "Enable vectorization of this loop with '#pragma clang loop " 5310 "vectorize(enable)' when compiling with -Os/-Oz", 5311 "NoTailLoopWithOptForSize", ORE, TheLoop); 5312 return None; 5313 } 5314 5315 unsigned 5316 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5317 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5318 unsigned SmallestType, WidestType; 5319 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5320 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5321 5322 // Get the maximum safe dependence distance in bits computed by LAA. 5323 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5324 // the memory accesses that is most restrictive (involved in the smallest 5325 // dependence distance). 5326 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5327 5328 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5329 5330 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5331 // Note that both WidestRegister and WidestType may not be a powers of 2. 5332 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5333 5334 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5335 << " / " << WidestType << " bits.\n"); 5336 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5337 << WidestRegister << " bits.\n"); 5338 5339 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5340 " into one vector!"); 5341 if (MaxVectorSize == 0) { 5342 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5343 MaxVectorSize = 1; 5344 return MaxVectorSize; 5345 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5346 isPowerOf2_32(ConstTripCount)) { 5347 // We need to clamp the VF to be the ConstTripCount. There is no point in 5348 // choosing a higher viable VF as done in the loop below. 5349 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5350 << ConstTripCount << "\n"); 5351 MaxVectorSize = ConstTripCount; 5352 return MaxVectorSize; 5353 } 5354 5355 unsigned MaxVF = MaxVectorSize; 5356 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5357 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5358 // Collect all viable vectorization factors larger than the default MaxVF 5359 // (i.e. MaxVectorSize). 5360 SmallVector<ElementCount, 8> VFs; 5361 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5362 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5363 VFs.push_back(ElementCount::getFixed(VS)); 5364 5365 // For each VF calculate its register usage. 5366 auto RUs = calculateRegisterUsage(VFs); 5367 5368 // Select the largest VF which doesn't require more registers than existing 5369 // ones. 
5370 for (int i = RUs.size() - 1; i >= 0; --i) { 5371 bool Selected = true; 5372 for (auto& pair : RUs[i].MaxLocalUsers) { 5373 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5374 if (pair.second > TargetNumRegisters) 5375 Selected = false; 5376 } 5377 if (Selected) { 5378 MaxVF = VFs[i].getKnownMinValue(); 5379 break; 5380 } 5381 } 5382 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5383 if (MaxVF < MinVF) { 5384 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5385 << ") with target's minimum: " << MinVF << '\n'); 5386 MaxVF = MinVF; 5387 } 5388 } 5389 } 5390 return MaxVF; 5391 } 5392 5393 VectorizationFactor 5394 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5395 float Cost = expectedCost(ElementCount::getFixed(1)).first; 5396 const float ScalarCost = Cost; 5397 unsigned Width = 1; 5398 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5399 5400 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5401 if (ForceVectorization && MaxVF > 1) { 5402 // Ignore scalar width, because the user explicitly wants vectorization. 5403 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5404 // evaluation. 5405 Cost = std::numeric_limits<float>::max(); 5406 } 5407 5408 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5409 // Notice that the vector loop needs to be executed less times, so 5410 // we need to divide the cost of the vector loops by the width of 5411 // the vector elements. 5412 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5413 float VectorCost = C.first / (float)i; 5414 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5415 << " costs: " << (int)VectorCost << ".\n"); 5416 if (!C.second && !ForceVectorization) { 5417 LLVM_DEBUG( 5418 dbgs() << "LV: Not considering vector loop of width " << i 5419 << " because it will not generate any vector instructions.\n"); 5420 continue; 5421 } 5422 if (VectorCost < Cost) { 5423 Cost = VectorCost; 5424 Width = i; 5425 } 5426 } 5427 5428 if (!EnableCondStoresVectorization && NumPredStores) { 5429 reportVectorizationFailure("There are conditional stores.", 5430 "store that is conditionally executed prevents vectorization", 5431 "ConditionalStore", ORE, TheLoop); 5432 Width = 1; 5433 Cost = ScalarCost; 5434 } 5435 5436 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5437 << "LV: Vectorization seems to be not beneficial, " 5438 << "but was forced by a user.\n"); 5439 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5440 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5441 (unsigned)(Width * Cost)}; 5442 return Factor; 5443 } 5444 5445 std::pair<unsigned, unsigned> 5446 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5447 unsigned MinWidth = -1U; 5448 unsigned MaxWidth = 8; 5449 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5450 5451 // For each block. 5452 for (BasicBlock *BB : TheLoop->blocks()) { 5453 // For each instruction in the loop. 5454 for (Instruction &I : BB->instructionsWithoutDebug()) { 5455 Type *T = I.getType(); 5456 5457 // Skip ignored values. 5458 if (ValuesToIgnore.count(&I)) 5459 continue; 5460 5461 // Only examine Loads, Stores and PHINodes. 5462 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5463 continue; 5464 5465 // Examine PHI nodes that are reduction variables. Update the type to 5466 // account for the recurrence type. 
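      // For example (illustrative only): an i32 reduction phi whose
      // RecurrenceDescriptor says the recurrence is computed in i8 is
      // recorded below as 8 bits rather than 32, which in turn can allow a
      // wider feasible VF.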
5467 if (auto *PN = dyn_cast<PHINode>(&I)) { 5468 if (!Legal->isReductionVariable(PN)) 5469 continue; 5470 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5471 T = RdxDesc.getRecurrenceType(); 5472 } 5473 5474 // Examine the stored values. 5475 if (auto *ST = dyn_cast<StoreInst>(&I)) 5476 T = ST->getValueOperand()->getType(); 5477 5478 // Ignore loaded pointer types and stored pointer types that are not 5479 // vectorizable. 5480 // 5481 // FIXME: The check here attempts to predict whether a load or store will 5482 // be vectorized. We only know this for certain after a VF has 5483 // been selected. Here, we assume that if an access can be 5484 // vectorized, it will be. We should also look at extending this 5485 // optimization to non-pointer types. 5486 // 5487 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5488 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5489 continue; 5490 5491 MinWidth = std::min(MinWidth, 5492 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5493 MaxWidth = std::max(MaxWidth, 5494 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5495 } 5496 } 5497 5498 return {MinWidth, MaxWidth}; 5499 } 5500 5501 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5502 unsigned LoopCost) { 5503 // -- The interleave heuristics -- 5504 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5505 // There are many micro-architectural considerations that we can't predict 5506 // at this level. For example, frontend pressure (on decode or fetch) due to 5507 // code size, or the number and capabilities of the execution ports. 5508 // 5509 // We use the following heuristics to select the interleave count: 5510 // 1. If the code has reductions, then we interleave to break the cross 5511 // iteration dependency. 5512 // 2. If the loop is really small, then we interleave to reduce the loop 5513 // overhead. 5514 // 3. We don't interleave if we think that we will spill registers to memory 5515 // due to the increased register pressure. 5516 5517 if (!isScalarEpilogueAllowed()) 5518 return 1; 5519 5520 // We used the distance for the interleave count. 5521 if (Legal->getMaxSafeDepDistBytes() != -1U) 5522 return 1; 5523 5524 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5525 const bool HasReductions = !Legal->getReductionVars().empty(); 5526 // Do not interleave loops with a relatively small known or estimated trip 5527 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5528 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5529 // because with the above conditions interleaving can expose ILP and break 5530 // cross iteration dependences for reductions. 5531 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5532 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5533 return 1; 5534 5535 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5536 // We divide by these constants so assume that we have at least one 5537 // instruction that uses at least one register. 5538 for (auto& pair : R.MaxLocalUsers) { 5539 pair.second = std::max(pair.second, 1U); 5540 } 5541 5542 // We calculate the interleave count using the following formula. 5543 // Subtract the number of loop invariants from the number of available 5544 // registers. These registers are used by all of the interleaved instances. 
5545 // Next, divide the remaining registers by the number of registers that is 5546 // required by the loop, in order to estimate how many parallel instances 5547 // fit without causing spills. All of this is rounded down if necessary to be 5548 // a power of two. We want power of two interleave count to simplify any 5549 // addressing operations or alignment considerations. 5550 // We also want power of two interleave counts to ensure that the induction 5551 // variable of the vector loop wraps to zero, when tail is folded by masking; 5552 // this currently happens when OptForSize, in which case IC is set to 1 above. 5553 unsigned IC = UINT_MAX; 5554 5555 for (auto& pair : R.MaxLocalUsers) { 5556 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5557 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5558 << " registers of " 5559 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5560 if (VF.isScalar()) { 5561 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5562 TargetNumRegisters = ForceTargetNumScalarRegs; 5563 } else { 5564 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5565 TargetNumRegisters = ForceTargetNumVectorRegs; 5566 } 5567 unsigned MaxLocalUsers = pair.second; 5568 unsigned LoopInvariantRegs = 0; 5569 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5570 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5571 5572 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5573 // Don't count the induction variable as interleaved. 5574 if (EnableIndVarRegisterHeur) { 5575 TmpIC = 5576 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5577 std::max(1U, (MaxLocalUsers - 1))); 5578 } 5579 5580 IC = std::min(IC, TmpIC); 5581 } 5582 5583 // Clamp the interleave ranges to reasonable counts. 5584 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5585 unsigned MaxInterleaveCount = 5586 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5587 5588 // Check if the user has overridden the max. 5589 if (VF.isScalar()) { 5590 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5591 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5592 } else { 5593 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5594 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5595 } 5596 5597 // If trip count is known or estimated compile time constant, limit the 5598 // interleave count to be less than the trip count divided by VF. 5599 if (BestKnownTC) { 5600 MaxInterleaveCount = 5601 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5602 } 5603 5604 // If we did not calculate the cost for VF (because the user selected the VF) 5605 // then we calculate the cost of VF here. 5606 if (LoopCost == 0) 5607 LoopCost = expectedCost(VF).first; 5608 5609 assert(LoopCost && "Non-zero loop cost expected"); 5610 5611 // Clamp the calculated IC to be between the 1 and the max interleave count 5612 // that the target and trip count allows. 5613 if (IC > MaxInterleaveCount) 5614 IC = MaxInterleaveCount; 5615 else if (IC < 1) 5616 IC = 1; 5617 5618 // Interleave if we vectorized this loop and there is a reduction that could 5619 // benefit from interleaving. 
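  // (A reduction carries a loop-carried dependence on its accumulator;
  // running IC interleaved copies gives IC independent partial accumulators
  // that are combined after the loop, which is why interleaving helps here.)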
5620 if (VF.isVector() && HasReductions) { 5621 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5622 return IC; 5623 } 5624 5625 // Note that if we've already vectorized the loop we will have done the 5626 // runtime check and so interleaving won't require further checks. 5627 bool InterleavingRequiresRuntimePointerCheck = 5628 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5629 5630 // We want to interleave small loops in order to reduce the loop overhead and 5631 // potentially expose ILP opportunities. 5632 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5633 << "LV: IC is " << IC << '\n' 5634 << "LV: VF is " << VF.getKnownMinValue() << '\n'); 5635 const bool AggressivelyInterleaveReductions = 5636 TTI.enableAggressiveInterleaving(HasReductions); 5637 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5638 // We assume that the cost overhead is 1 and we use the cost model 5639 // to estimate the cost of the loop and interleave until the cost of the 5640 // loop overhead is about 5% of the cost of the loop. 5641 unsigned SmallIC = 5642 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5643 5644 // Interleave until store/load ports (estimated by max interleave count) are 5645 // saturated. 5646 unsigned NumStores = Legal->getNumStores(); 5647 unsigned NumLoads = Legal->getNumLoads(); 5648 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5649 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5650 5651 // If we have a scalar reduction (vector reductions are already dealt with 5652 // by this point), we can increase the critical path length if the loop 5653 // we're interleaving is inside another loop. Limit, by default to 2, so the 5654 // critical path only gets increased by one reduction operation. 5655 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5656 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5657 SmallIC = std::min(SmallIC, F); 5658 StoresIC = std::min(StoresIC, F); 5659 LoadsIC = std::min(LoadsIC, F); 5660 } 5661 5662 if (EnableLoadStoreRuntimeInterleave && 5663 std::max(StoresIC, LoadsIC) > SmallIC) { 5664 LLVM_DEBUG( 5665 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5666 return std::max(StoresIC, LoadsIC); 5667 } 5668 5669 // If there are scalar reductions and TTI has enabled aggressive 5670 // interleaving for reductions, we will interleave to expose ILP. 5671 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5672 AggressivelyInterleaveReductions) { 5673 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5674 // Interleave no less than SmallIC but not as aggressive as the normal IC 5675 // to satisfy the rare situation when resources are too limited. 5676 return std::max(IC / 2, SmallIC); 5677 } else { 5678 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5679 return SmallIC; 5680 } 5681 } 5682 5683 // Interleave if this is a large loop (small loops are already dealt with by 5684 // this point) that could benefit from interleaving. 
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps indices to instructions.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the set of instructions that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
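  // Illustrative example (hypothetical values): if EndPoint maps
  // {A -> 3, B -> 3, C -> 7}, the transposed map becomes
  // {3 -> [A, B], 7 -> [C]}, so when the linear scan below reaches index 3
  // it can close the intervals of both A and B with a single lookup.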
5763 for (auto &Interval : EndPoint) 5764 TransposeEnds[Interval.second].push_back(Interval.first); 5765 5766 SmallPtrSet<Instruction *, 8> OpenIntervals; 5767 5768 // Get the size of the widest register. 5769 unsigned MaxSafeDepDist = -1U; 5770 if (Legal->getMaxSafeDepDistBytes() != -1U) 5771 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5772 unsigned WidestRegister = 5773 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5774 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5775 5776 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5777 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5778 5779 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5780 5781 // A lambda that gets the register usage for the given type and VF. 5782 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) { 5783 if (Ty->isTokenTy()) 5784 return 0U; 5785 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5786 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5787 return std::max<unsigned>(1, VF.getKnownMinValue() * TypeSize / 5788 WidestRegister); 5789 }; 5790 5791 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5792 Instruction *I = IdxToInstr[i]; 5793 5794 // Remove all of the instructions that end at this location. 5795 InstrList &List = TransposeEnds[i]; 5796 for (Instruction *ToRemove : List) 5797 OpenIntervals.erase(ToRemove); 5798 5799 // Ignore instructions that are never used within the loop. 5800 if (!Ends.count(I)) 5801 continue; 5802 5803 // Skip ignored values. 5804 if (ValuesToIgnore.count(I)) 5805 continue; 5806 5807 // For each VF find the maximum usage of registers. 5808 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5809 // Count the number of live intervals. 5810 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5811 5812 if (VFs[j].isScalar()) { 5813 for (auto Inst : OpenIntervals) { 5814 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5815 if (RegUsage.find(ClassID) == RegUsage.end()) 5816 RegUsage[ClassID] = 1; 5817 else 5818 RegUsage[ClassID] += 1; 5819 } 5820 } else { 5821 collectUniformsAndScalars(VFs[j]); 5822 for (auto Inst : OpenIntervals) { 5823 // Skip ignored values for VF > 1. 5824 if (VecValuesToIgnore.count(Inst)) 5825 continue; 5826 if (isScalarAfterVectorization(Inst, VFs[j])) { 5827 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5828 if (RegUsage.find(ClassID) == RegUsage.end()) 5829 RegUsage[ClassID] = 1; 5830 else 5831 RegUsage[ClassID] += 1; 5832 } else { 5833 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5834 if (RegUsage.find(ClassID) == RegUsage.end()) 5835 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5836 else 5837 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5838 } 5839 } 5840 } 5841 5842 for (auto& pair : RegUsage) { 5843 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5844 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5845 else 5846 MaxUsages[j][pair.first] = pair.second; 5847 } 5848 } 5849 5850 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5851 << OpenIntervals.size() << '\n'); 5852 5853 // Add the current instruction to the list of open intervals. 
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto Inst : LoopInvariants) {
      unsigned Usage =
          VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      if (Invariant.find(ClassID) == Invariant.end())
        Invariant[ClassID] = Usage;
      else
        Invariant[ClassID] += Usage;
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n";
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where a previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // A limited amount of Masked Store/Scatter emulation was allowed.
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() ||
      InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
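        // A non-negative result from computePredInstDiscount means the
        // scalarized form of the chain is estimated to be no more expensive
        // than the vector form; e.g. (illustrative numbers) a vector cost of
        // 10 against a probability-scaled scalar cost of 7 gives a discount
        // of 3, so the chain is recorded in ScalarCostsVF.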
5937 if (!useEmulatedMaskMemRefHack(&I) && 5938 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5939 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5940 // Remember that BB will remain after vectorization. 5941 PredicatedBBsAfterVectorization.insert(BB); 5942 } 5943 } 5944 } 5945 5946 int LoopVectorizationCostModel::computePredInstDiscount( 5947 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5948 ElementCount VF) { 5949 assert(!isUniformAfterVectorization(PredInst, VF) && 5950 "Instruction marked uniform-after-vectorization will be predicated"); 5951 5952 // Initialize the discount to zero, meaning that the scalar version and the 5953 // vector version cost the same. 5954 int Discount = 0; 5955 5956 // Holds instructions to analyze. The instructions we visit are mapped in 5957 // ScalarCosts. Those instructions are the ones that would be scalarized if 5958 // we find that the scalar version costs less. 5959 SmallVector<Instruction *, 8> Worklist; 5960 5961 // Returns true if the given instruction can be scalarized. 5962 auto canBeScalarized = [&](Instruction *I) -> bool { 5963 // We only attempt to scalarize instructions forming a single-use chain 5964 // from the original predicated block that would otherwise be vectorized. 5965 // Although not strictly necessary, we give up on instructions we know will 5966 // already be scalar to avoid traversing chains that are unlikely to be 5967 // beneficial. 5968 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5969 isScalarAfterVectorization(I, VF)) 5970 return false; 5971 5972 // If the instruction is scalar with predication, it will be analyzed 5973 // separately. We ignore it within the context of PredInst. 5974 if (isScalarWithPredication(I)) 5975 return false; 5976 5977 // If any of the instruction's operands are uniform after vectorization, 5978 // the instruction cannot be scalarized. This prevents, for example, a 5979 // masked load from being scalarized. 5980 // 5981 // We assume we will only emit a value for lane zero of an instruction 5982 // marked uniform after vectorization, rather than VF identical values. 5983 // Thus, if we scalarize an instruction that uses a uniform, we would 5984 // create uses of values corresponding to the lanes we aren't emitting code 5985 // for. This behavior can be changed by allowing getScalarValue to clone 5986 // the lane zero values for uniforms rather than asserting. 5987 for (Use &U : I->operands()) 5988 if (auto *J = dyn_cast<Instruction>(U.get())) 5989 if (isUniformAfterVectorization(J, VF)) 5990 return false; 5991 5992 // Otherwise, we can scalarize the instruction. 5993 return true; 5994 }; 5995 5996 // Compute the expected cost discount from scalarizing the entire expression 5997 // feeding the predicated instruction. We currently only consider expressions 5998 // that are single-use instruction chains. 5999 Worklist.push_back(PredInst); 6000 while (!Worklist.empty()) { 6001 Instruction *I = Worklist.pop_back_val(); 6002 6003 // If we've already analyzed the instruction, there's nothing to do. 6004 if (ScalarCosts.find(I) != ScalarCosts.end()) 6005 continue; 6006 6007 // Compute the cost of the vector instruction. Note that this cost already 6008 // includes the scalarization overhead of the predicated instruction. 6009 unsigned VectorCost = getInstructionCost(I, VF).first; 6010 6011 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6012 // the instruction as if it wasn't if-converted and instead remained in the 6013 // predicated block. We will scale this cost by block probability after 6014 // computing the scalarization overhead. 6015 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6016 unsigned ScalarCost = 6017 VF.getKnownMinValue() * 6018 getInstructionCost(I, ElementCount::getFixed(1)).first; 6019 6020 // Compute the scalarization overhead of needed insertelement instructions 6021 // and phi nodes. 6022 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6023 ScalarCost += TTI.getScalarizationOverhead( 6024 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6025 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6026 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6027 ScalarCost += 6028 VF.getKnownMinValue() * 6029 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6030 } 6031 6032 // Compute the scalarization overhead of needed extractelement 6033 // instructions. For each of the instruction's operands, if the operand can 6034 // be scalarized, add it to the worklist; otherwise, account for the 6035 // overhead. 6036 for (Use &U : I->operands()) 6037 if (auto *J = dyn_cast<Instruction>(U.get())) { 6038 assert(VectorType::isValidElementType(J->getType()) && 6039 "Instruction has non-scalar type"); 6040 if (canBeScalarized(J)) 6041 Worklist.push_back(J); 6042 else if (needsExtract(J, VF)) { 6043 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6044 ScalarCost += TTI.getScalarizationOverhead( 6045 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6046 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6047 } 6048 } 6049 6050 // Scale the total scalar cost by block probability. 6051 ScalarCost /= getReciprocalPredBlockProb(); 6052 6053 // Compute the discount. A non-negative discount means the vector version 6054 // of the instruction costs more, and scalarizing would be beneficial. 6055 Discount += VectorCost - ScalarCost; 6056 ScalarCosts[I] = ScalarCost; 6057 } 6058 6059 return Discount; 6060 } 6061 6062 LoopVectorizationCostModel::VectorizationCostTy 6063 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6064 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6065 VectorizationCostTy Cost; 6066 6067 // For each block. 6068 for (BasicBlock *BB : TheLoop->blocks()) { 6069 VectorizationCostTy BlockCost; 6070 6071 // For each instruction in the old loop. 6072 for (Instruction &I : BB->instructionsWithoutDebug()) { 6073 // Skip ignored values. 6074 if (ValuesToIgnore.count(&I) || 6075 (VF.isVector() && VecValuesToIgnore.count(&I))) 6076 continue; 6077 6078 VectorizationCostTy C = getInstructionCost(&I, VF); 6079 6080 // Check if we should override the cost. 6081 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6082 C.first = ForceTargetInstructionCost; 6083 6084 BlockCost.first += C.first; 6085 BlockCost.second |= C.second; 6086 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6087 << " for VF " << VF << " For instruction: " << I 6088 << '\n'); 6089 } 6090 6091 // If we are vectorizing a predicated block, it will have been 6092 // if-converted. This means that the block's instructions (aside from 6093 // stores and instructions that may divide by zero) will now be 6094 // unconditionally executed. For the scalar case, we may not always execute 6095 // the predicated block. Thus, scale the block's cost by the probability of 6096 // executing it. 
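    // For instance (assuming the usual reciprocal block probability of 2,
    // i.e. a predicated block modelled as running on half the iterations),
    // a scalar block cost of 8 is accounted as 4 by the division below.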
6097 if (VF.isScalar() && blockNeedsPredication(BB)) 6098 BlockCost.first /= getReciprocalPredBlockProb(); 6099 6100 Cost.first += BlockCost.first; 6101 Cost.second |= BlockCost.second; 6102 } 6103 6104 return Cost; 6105 } 6106 6107 /// Gets Address Access SCEV after verifying that the access pattern 6108 /// is loop invariant except the induction variable dependence. 6109 /// 6110 /// This SCEV can be sent to the Target in order to estimate the address 6111 /// calculation cost. 6112 static const SCEV *getAddressAccessSCEV( 6113 Value *Ptr, 6114 LoopVectorizationLegality *Legal, 6115 PredicatedScalarEvolution &PSE, 6116 const Loop *TheLoop) { 6117 6118 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6119 if (!Gep) 6120 return nullptr; 6121 6122 // We are looking for a gep with all loop invariant indices except for one 6123 // which should be an induction variable. 6124 auto SE = PSE.getSE(); 6125 unsigned NumOperands = Gep->getNumOperands(); 6126 for (unsigned i = 1; i < NumOperands; ++i) { 6127 Value *Opd = Gep->getOperand(i); 6128 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6129 !Legal->isInductionVariable(Opd)) 6130 return nullptr; 6131 } 6132 6133 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6134 return PSE.getSCEV(Ptr); 6135 } 6136 6137 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6138 return Legal->hasStride(I->getOperand(0)) || 6139 Legal->hasStride(I->getOperand(1)); 6140 } 6141 6142 unsigned 6143 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6144 ElementCount VF) { 6145 assert(VF.isVector() && 6146 "Scalarization cost of instruction implies vectorization."); 6147 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6148 Type *ValTy = getMemInstValueType(I); 6149 auto SE = PSE.getSE(); 6150 6151 unsigned AS = getLoadStoreAddressSpace(I); 6152 Value *Ptr = getLoadStorePointerOperand(I); 6153 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6154 6155 // Figure out whether the access is strided and get the stride value 6156 // if it's known in compile time 6157 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6158 6159 // Get the cost of the scalar memory instruction and address computation. 6160 unsigned Cost = 6161 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6162 6163 // Don't pass *I here, since it is scalar but will actually be part of a 6164 // vectorized loop where the user of it is a vectorized instruction. 6165 const Align Alignment = getLoadStoreAlignment(I); 6166 Cost += VF.getKnownMinValue() * 6167 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6168 AS, TTI::TCK_RecipThroughput); 6169 6170 // Get the overhead of the extractelement and insertelement instructions 6171 // we might create due to scalarization. 6172 Cost += getScalarizationOverhead(I, VF); 6173 6174 // If we have a predicated store, it may not be executed for each vector 6175 // lane. Scale the cost by the probability of executing the predicated 6176 // block. 6177 if (isPredicatedInst(I)) { 6178 Cost /= getReciprocalPredBlockProb(); 6179 6180 if (useEmulatedMaskMemRefHack(I)) 6181 // Artificially setting to a high enough value to practically disable 6182 // vectorization with such operations. 
6183 Cost = 3000000; 6184 } 6185 6186 return Cost; 6187 } 6188 6189 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6190 ElementCount VF) { 6191 Type *ValTy = getMemInstValueType(I); 6192 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6193 Value *Ptr = getLoadStorePointerOperand(I); 6194 unsigned AS = getLoadStoreAddressSpace(I); 6195 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6196 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6197 6198 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6199 "Stride should be 1 or -1 for consecutive memory access"); 6200 const Align Alignment = getLoadStoreAlignment(I); 6201 unsigned Cost = 0; 6202 if (Legal->isMaskRequired(I)) 6203 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6204 CostKind); 6205 else 6206 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6207 CostKind, I); 6208 6209 bool Reverse = ConsecutiveStride < 0; 6210 if (Reverse) 6211 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6212 return Cost; 6213 } 6214 6215 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6216 ElementCount VF) { 6217 Type *ValTy = getMemInstValueType(I); 6218 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6219 const Align Alignment = getLoadStoreAlignment(I); 6220 unsigned AS = getLoadStoreAddressSpace(I); 6221 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6222 if (isa<LoadInst>(I)) { 6223 return TTI.getAddressComputationCost(ValTy) + 6224 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6225 CostKind) + 6226 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6227 } 6228 StoreInst *SI = cast<StoreInst>(I); 6229 6230 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6231 return TTI.getAddressComputationCost(ValTy) + 6232 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6233 CostKind) + 6234 (isLoopInvariantStoreValue 6235 ? 0 6236 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6237 VF.getKnownMinValue() - 1)); 6238 } 6239 6240 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6241 ElementCount VF) { 6242 Type *ValTy = getMemInstValueType(I); 6243 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6244 const Align Alignment = getLoadStoreAlignment(I); 6245 const Value *Ptr = getLoadStorePointerOperand(I); 6246 6247 return TTI.getAddressComputationCost(VectorTy) + 6248 TTI.getGatherScatterOpCost( 6249 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6250 TargetTransformInfo::TCK_RecipThroughput, I); 6251 } 6252 6253 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6254 ElementCount VF) { 6255 Type *ValTy = getMemInstValueType(I); 6256 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6257 unsigned AS = getLoadStoreAddressSpace(I); 6258 6259 auto Group = getInterleavedAccessGroup(I); 6260 assert(Group && "Fail to get an interleaved access group."); 6261 6262 unsigned InterleaveFactor = Group->getFactor(); 6263 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6264 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6265 6266 // Holds the indices of existing members in an interleaved load group. 6267 // An interleaved store group doesn't need this as it doesn't allow gaps. 
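  // E.g. (illustrative) a factor-4 load group whose members occupy positions
  // 0 and 2 only yields Indices = {0, 2}, letting the interleaved-memory
  // cost query below know that positions 1 and 3 are gaps.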
6268 SmallVector<unsigned, 4> Indices; 6269 if (isa<LoadInst>(I)) { 6270 for (unsigned i = 0; i < InterleaveFactor; i++) 6271 if (Group->getMember(i)) 6272 Indices.push_back(i); 6273 } 6274 6275 // Calculate the cost of the whole interleaved group. 6276 bool UseMaskForGaps = 6277 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6278 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6279 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6280 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6281 6282 if (Group->isReverse()) { 6283 // TODO: Add support for reversed masked interleaved access. 6284 assert(!Legal->isMaskRequired(I) && 6285 "Reverse masked interleaved access not supported."); 6286 Cost += Group->getNumMembers() * 6287 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6288 } 6289 return Cost; 6290 } 6291 6292 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6293 ElementCount VF) { 6294 // Calculate scalar cost only. Vectorization cost should be ready at this 6295 // moment. 6296 if (VF.isScalar()) { 6297 Type *ValTy = getMemInstValueType(I); 6298 const Align Alignment = getLoadStoreAlignment(I); 6299 unsigned AS = getLoadStoreAddressSpace(I); 6300 6301 return TTI.getAddressComputationCost(ValTy) + 6302 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6303 TTI::TCK_RecipThroughput, I); 6304 } 6305 return getWideningCost(I, VF); 6306 } 6307 6308 LoopVectorizationCostModel::VectorizationCostTy 6309 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6310 ElementCount VF) { 6311 assert(!VF.isScalable() && 6312 "the cost model is not yet implemented for scalable vectorization"); 6313 // If we know that this instruction will remain uniform, check the cost of 6314 // the scalar version. 6315 if (isUniformAfterVectorization(I, VF)) 6316 VF = ElementCount::getFixed(1); 6317 6318 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6319 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6320 6321 // Forced scalars do not have any scalarization overhead. 6322 auto ForcedScalar = ForcedScalars.find(VF); 6323 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6324 auto InstSet = ForcedScalar->second; 6325 if (InstSet.count(I)) 6326 return VectorizationCostTy( 6327 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6328 VF.getKnownMinValue()), 6329 false); 6330 } 6331 6332 Type *VectorTy; 6333 unsigned C = getInstructionCost(I, VF, VectorTy); 6334 6335 bool TypeNotScalarized = 6336 VF.isVector() && VectorTy->isVectorTy() && 6337 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6338 return VectorizationCostTy(C, TypeNotScalarized); 6339 } 6340 6341 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6342 ElementCount VF) { 6343 6344 assert(!VF.isScalable() && 6345 "cannot compute scalarization overhead for scalable vectorization"); 6346 if (VF.isScalar()) 6347 return 0; 6348 6349 unsigned Cost = 0; 6350 Type *RetTy = ToVectorTy(I->getType(), VF); 6351 if (!RetTy->isVoidTy() && 6352 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6353 Cost += TTI.getScalarizationOverhead( 6354 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 6355 true, false); 6356 6357 // Some targets keep addresses scalar. 6358 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6359 return Cost; 6360 6361 // Some targets support efficient element stores. 
6362 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6363 return Cost; 6364 6365 // Collect operands to consider. 6366 CallInst *CI = dyn_cast<CallInst>(I); 6367 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6368 6369 // Skip operands that do not require extraction/scalarization and do not incur 6370 // any overhead. 6371 return Cost + TTI.getOperandsScalarizationOverhead( 6372 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 6373 } 6374 6375 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6376 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6377 if (VF.isScalar()) 6378 return; 6379 NumPredStores = 0; 6380 for (BasicBlock *BB : TheLoop->blocks()) { 6381 // For each instruction in the old loop. 6382 for (Instruction &I : *BB) { 6383 Value *Ptr = getLoadStorePointerOperand(&I); 6384 if (!Ptr) 6385 continue; 6386 6387 // TODO: We should generate better code and update the cost model for 6388 // predicated uniform stores. Today they are treated as any other 6389 // predicated store (see added test cases in 6390 // invariant-store-vectorization.ll). 6391 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6392 NumPredStores++; 6393 6394 if (Legal->isUniform(Ptr) && 6395 // Conditional loads and stores should be scalarized and predicated. 6396 // isScalarWithPredication cannot be used here since masked 6397 // gather/scatters are not considered scalar with predication. 6398 !Legal->blockNeedsPredication(I.getParent())) { 6399 // TODO: Avoid replicating loads and stores instead of 6400 // relying on instcombine to remove them. 6401 // Load: Scalar load + broadcast 6402 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6403 unsigned Cost = getUniformMemOpCost(&I, VF); 6404 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6405 continue; 6406 } 6407 6408 // We assume that widening is the best solution when possible. 6409 if (memoryInstructionCanBeWidened(&I, VF)) { 6410 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6411 int ConsecutiveStride = 6412 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6413 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6414 "Expected consecutive stride."); 6415 InstWidening Decision = 6416 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6417 setWideningDecision(&I, VF, Decision, Cost); 6418 continue; 6419 } 6420 6421 // Choose between Interleaving, Gather/Scatter or Scalarization. 6422 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6423 unsigned NumAccesses = 1; 6424 if (isAccessInterleaved(&I)) { 6425 auto Group = getInterleavedAccessGroup(&I); 6426 assert(Group && "Fail to get an interleaved access group."); 6427 6428 // Make one decision for the whole group. 6429 if (getWideningDecision(&I, VF) != CM_Unknown) 6430 continue; 6431 6432 NumAccesses = Group->getNumMembers(); 6433 if (interleavedAccessCanBeWidened(&I, VF)) 6434 InterleaveCost = getInterleaveGroupCost(&I, VF); 6435 } 6436 6437 unsigned GatherScatterCost = 6438 isLegalGatherOrScatter(&I) 6439 ? getGatherScatterCost(&I, VF) * NumAccesses 6440 : std::numeric_limits<unsigned>::max(); 6441 6442 unsigned ScalarizationCost = 6443 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6444 6445 // Choose better solution for the current VF, 6446 // write down this decision and use it during vectorization. 
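      // Illustrative example (made-up costs): InterleaveCost = 8,
      // GatherScatterCost = 12 and ScalarizationCost = 20 selects
      // CM_Interleave with cost 8; note that the comparison below prefers
      // interleaving when it ties with gather/scatter.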
6447 unsigned Cost; 6448 InstWidening Decision; 6449 if (InterleaveCost <= GatherScatterCost && 6450 InterleaveCost < ScalarizationCost) { 6451 Decision = CM_Interleave; 6452 Cost = InterleaveCost; 6453 } else if (GatherScatterCost < ScalarizationCost) { 6454 Decision = CM_GatherScatter; 6455 Cost = GatherScatterCost; 6456 } else { 6457 Decision = CM_Scalarize; 6458 Cost = ScalarizationCost; 6459 } 6460 // If the instructions belongs to an interleave group, the whole group 6461 // receives the same decision. The whole group receives the cost, but 6462 // the cost will actually be assigned to one instruction. 6463 if (auto Group = getInterleavedAccessGroup(&I)) 6464 setWideningDecision(Group, VF, Decision, Cost); 6465 else 6466 setWideningDecision(&I, VF, Decision, Cost); 6467 } 6468 } 6469 6470 // Make sure that any load of address and any other address computation 6471 // remains scalar unless there is gather/scatter support. This avoids 6472 // inevitable extracts into address registers, and also has the benefit of 6473 // activating LSR more, since that pass can't optimize vectorized 6474 // addresses. 6475 if (TTI.prefersVectorizedAddressing()) 6476 return; 6477 6478 // Start with all scalar pointer uses. 6479 SmallPtrSet<Instruction *, 8> AddrDefs; 6480 for (BasicBlock *BB : TheLoop->blocks()) 6481 for (Instruction &I : *BB) { 6482 Instruction *PtrDef = 6483 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6484 if (PtrDef && TheLoop->contains(PtrDef) && 6485 getWideningDecision(&I, VF) != CM_GatherScatter) 6486 AddrDefs.insert(PtrDef); 6487 } 6488 6489 // Add all instructions used to generate the addresses. 6490 SmallVector<Instruction *, 4> Worklist; 6491 for (auto *I : AddrDefs) 6492 Worklist.push_back(I); 6493 while (!Worklist.empty()) { 6494 Instruction *I = Worklist.pop_back_val(); 6495 for (auto &Op : I->operands()) 6496 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6497 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6498 AddrDefs.insert(InstOp).second) 6499 Worklist.push_back(InstOp); 6500 } 6501 6502 for (auto *I : AddrDefs) { 6503 if (isa<LoadInst>(I)) { 6504 // Setting the desired widening decision should ideally be handled in 6505 // by cost functions, but since this involves the task of finding out 6506 // if the loaded register is involved in an address computation, it is 6507 // instead changed here when we know this is the case. 6508 InstWidening Decision = getWideningDecision(I, VF); 6509 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6510 // Scalarize a widened load of address. 6511 setWideningDecision( 6512 I, VF, CM_Scalarize, 6513 (VF.getKnownMinValue() * 6514 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6515 else if (auto Group = getInterleavedAccessGroup(I)) { 6516 // Scalarize an interleave group of address loads. 6517 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6518 if (Instruction *Member = Group->getMember(I)) 6519 setWideningDecision( 6520 Member, VF, CM_Scalarize, 6521 (VF.getKnownMinValue() * 6522 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6523 } 6524 } 6525 } else 6526 // Make sure I gets scalarized and a cost estimate without 6527 // scalarization overhead. 
6528 ForcedScalars[VF].insert(I); 6529 } 6530 } 6531 6532 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6533 ElementCount VF, 6534 Type *&VectorTy) { 6535 Type *RetTy = I->getType(); 6536 if (canTruncateToMinimalBitwidth(I, VF)) 6537 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6538 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6539 auto SE = PSE.getSE(); 6540 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6541 6542 // TODO: We need to estimate the cost of intrinsic calls. 6543 switch (I->getOpcode()) { 6544 case Instruction::GetElementPtr: 6545 // We mark this instruction as zero-cost because the cost of GEPs in 6546 // vectorized code depends on whether the corresponding memory instruction 6547 // is scalarized or not. Therefore, we handle GEPs with the memory 6548 // instruction cost. 6549 return 0; 6550 case Instruction::Br: { 6551 // In cases of scalarized and predicated instructions, there will be VF 6552 // predicated blocks in the vectorized loop. Each branch around these 6553 // blocks requires also an extract of its vector compare i1 element. 6554 bool ScalarPredicatedBB = false; 6555 BranchInst *BI = cast<BranchInst>(I); 6556 if (VF.isVector() && BI->isConditional() && 6557 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6558 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6559 ScalarPredicatedBB = true; 6560 6561 if (ScalarPredicatedBB) { 6562 // Return cost for branches around scalarized and predicated blocks. 6563 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6564 auto *Vec_i1Ty = 6565 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6566 return (TTI.getScalarizationOverhead( 6567 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6568 false, true) + 6569 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 6570 VF.getKnownMinValue())); 6571 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6572 // The back-edge branch will remain, as will all scalar branches. 6573 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6574 else 6575 // This branch will be eliminated by if-conversion. 6576 return 0; 6577 // Note: We currently assume zero cost for an unconditional branch inside 6578 // a predicated block since it will become a fall-through, although we 6579 // may decide in the future to call TTI for all branches. 6580 } 6581 case Instruction::PHI: { 6582 auto *Phi = cast<PHINode>(I); 6583 6584 // First-order recurrences are replaced by vector shuffles inside the loop. 6585 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6586 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 6587 return TTI.getShuffleCost( 6588 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 6589 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 6590 6591 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6592 // converted into select instructions. We require N - 1 selects per phi 6593 // node, where N is the number of incoming values. 
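    // E.g. a phi merging three incoming values in a non-header block is
    // costed as two vector selects of the phi's vectorized type.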
6594 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 6595 return (Phi->getNumIncomingValues() - 1) * 6596 TTI.getCmpSelInstrCost( 6597 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6598 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6599 CostKind); 6600 6601 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6602 } 6603 case Instruction::UDiv: 6604 case Instruction::SDiv: 6605 case Instruction::URem: 6606 case Instruction::SRem: 6607 // If we have a predicated instruction, it may not be executed for each 6608 // vector lane. Get the scalarization cost and scale this amount by the 6609 // probability of executing the predicated block. If the instruction is not 6610 // predicated, we fall through to the next case. 6611 if (VF.isVector() && isScalarWithPredication(I)) { 6612 unsigned Cost = 0; 6613 6614 // These instructions have a non-void type, so account for the phi nodes 6615 // that we will create. This cost is likely to be zero. The phi node 6616 // cost, if any, should be scaled by the block probability because it 6617 // models a copy at the end of each predicated block. 6618 Cost += VF.getKnownMinValue() * 6619 TTI.getCFInstrCost(Instruction::PHI, CostKind); 6620 6621 // The cost of the non-predicated instruction. 6622 Cost += VF.getKnownMinValue() * 6623 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6624 6625 // The cost of insertelement and extractelement instructions needed for 6626 // scalarization. 6627 Cost += getScalarizationOverhead(I, VF); 6628 6629 // Scale the cost by the probability of executing the predicated blocks. 6630 // This assumes the predicated block for each vector lane is equally 6631 // likely. 6632 return Cost / getReciprocalPredBlockProb(); 6633 } 6634 LLVM_FALLTHROUGH; 6635 case Instruction::Add: 6636 case Instruction::FAdd: 6637 case Instruction::Sub: 6638 case Instruction::FSub: 6639 case Instruction::Mul: 6640 case Instruction::FMul: 6641 case Instruction::FDiv: 6642 case Instruction::FRem: 6643 case Instruction::Shl: 6644 case Instruction::LShr: 6645 case Instruction::AShr: 6646 case Instruction::And: 6647 case Instruction::Or: 6648 case Instruction::Xor: { 6649 // Since we will replace the stride by 1 the multiplication should go away. 6650 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6651 return 0; 6652 // Certain instructions can be cheaper to vectorize if they have a constant 6653 // second vector operand. One example of this are shifts on x86. 6654 Value *Op2 = I->getOperand(1); 6655 TargetTransformInfo::OperandValueProperties Op2VP; 6656 TargetTransformInfo::OperandValueKind Op2VK = 6657 TTI.getOperandInfo(Op2, Op2VP); 6658 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6659 Op2VK = TargetTransformInfo::OK_UniformValue; 6660 6661 SmallVector<const Value *, 4> Operands(I->operand_values()); 6662 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 6663 return N * TTI.getArithmeticInstrCost( 6664 I->getOpcode(), VectorTy, CostKind, 6665 TargetTransformInfo::OK_AnyValue, 6666 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6667 } 6668 case Instruction::FNeg: { 6669 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 6670 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 6671 return N * TTI.getArithmeticInstrCost( 6672 I->getOpcode(), VectorTy, CostKind, 6673 TargetTransformInfo::OK_AnyValue, 6674 TargetTransformInfo::OK_AnyValue, 6675 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6676 I->getOperand(0), I); 6677 } 6678 case Instruction::Select: { 6679 SelectInst *SI = cast<SelectInst>(I); 6680 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6681 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6682 Type *CondTy = SI->getCondition()->getType(); 6683 if (!ScalarCond) { 6684 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 6685 CondTy = VectorType::get(CondTy, VF); 6686 } 6687 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 6688 CostKind, I); 6689 } 6690 case Instruction::ICmp: 6691 case Instruction::FCmp: { 6692 Type *ValTy = I->getOperand(0)->getType(); 6693 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6694 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6695 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6696 VectorTy = ToVectorTy(ValTy, VF); 6697 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, 6698 I); 6699 } 6700 case Instruction::Store: 6701 case Instruction::Load: { 6702 ElementCount Width = VF; 6703 if (Width.isVector()) { 6704 InstWidening Decision = getWideningDecision(I, Width); 6705 assert(Decision != CM_Unknown && 6706 "CM decision should be taken at this point"); 6707 if (Decision == CM_Scalarize) 6708 Width = ElementCount::getFixed(1); 6709 } 6710 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6711 return getMemoryInstructionCost(I, VF); 6712 } 6713 case Instruction::ZExt: 6714 case Instruction::SExt: 6715 case Instruction::FPToUI: 6716 case Instruction::FPToSI: 6717 case Instruction::FPExt: 6718 case Instruction::PtrToInt: 6719 case Instruction::IntToPtr: 6720 case Instruction::SIToFP: 6721 case Instruction::UIToFP: 6722 case Instruction::Trunc: 6723 case Instruction::FPTrunc: 6724 case Instruction::BitCast: { 6725 // Computes the CastContextHint from a Load/Store instruction. 6726 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 6727 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6728 "Expected a load or a store!"); 6729 6730 if (VF.isScalar() || !TheLoop->contains(I)) 6731 return TTI::CastContextHint::Normal; 6732 6733 switch (getWideningDecision(I, VF)) { 6734 case LoopVectorizationCostModel::CM_GatherScatter: 6735 return TTI::CastContextHint::GatherScatter; 6736 case LoopVectorizationCostModel::CM_Interleave: 6737 return TTI::CastContextHint::Interleave; 6738 case LoopVectorizationCostModel::CM_Scalarize: 6739 case LoopVectorizationCostModel::CM_Widen: 6740 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 6741 : TTI::CastContextHint::Normal; 6742 case LoopVectorizationCostModel::CM_Widen_Reverse: 6743 return TTI::CastContextHint::Reversed; 6744 case LoopVectorizationCostModel::CM_Unknown: 6745 llvm_unreachable("Instr did not go through cost modelling?"); 6746 } 6747 6748 llvm_unreachable("Unhandled case!"); 6749 }; 6750 6751 unsigned Opcode = I->getOpcode(); 6752 TTI::CastContextHint CCH = TTI::CastContextHint::None; 6753 // For Trunc, the context is the only user, which must be a StoreInst. 
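    // For example, a trunc whose single user is a store with a
    // CM_Widen_Reverse decision is costed with CastContextHint::Reversed,
    // per the mapping in ComputeCCH above.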
6754 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 6755 if (I->hasOneUse()) 6756 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 6757 CCH = ComputeCCH(Store); 6758 } 6759 // For Z/Sext, the context is the operand, which must be a LoadInst. 6760 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 6761 Opcode == Instruction::FPExt) { 6762 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 6763 CCH = ComputeCCH(Load); 6764 } 6765 6766 // We optimize the truncation of induction variables having constant 6767 // integer steps. The cost of these truncations is the same as the scalar 6768 // operation. 6769 if (isOptimizableIVTruncate(I, VF)) { 6770 auto *Trunc = cast<TruncInst>(I); 6771 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6772 Trunc->getSrcTy(), CCH, CostKind, Trunc); 6773 } 6774 6775 Type *SrcScalarTy = I->getOperand(0)->getType(); 6776 Type *SrcVecTy = 6777 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6778 if (canTruncateToMinimalBitwidth(I, VF)) { 6779 // This cast is going to be shrunk. This may remove the cast or it might 6780 // turn it into slightly different cast. For example, if MinBW == 16, 6781 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6782 // 6783 // Calculate the modified src and dest types. 6784 Type *MinVecTy = VectorTy; 6785 if (Opcode == Instruction::Trunc) { 6786 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6787 VectorTy = 6788 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6789 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 6790 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6791 VectorTy = 6792 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6793 } 6794 } 6795 6796 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 6797 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 6798 return N * 6799 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 6800 } 6801 case Instruction::Call: { 6802 bool NeedToScalarize; 6803 CallInst *CI = cast<CallInst>(I); 6804 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6805 if (getVectorIntrinsicIDForCall(CI, TLI)) 6806 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6807 return CallCost; 6808 } 6809 default: 6810 // The cost of executing VF copies of the scalar instruction. This opcode 6811 // is unknown. Assume that it is the same as 'mul'. 6812 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 6813 Instruction::Mul, VectorTy, CostKind) + 6814 getScalarizationOverhead(I, VF); 6815 } // end of switch. 
6816 } 6817 6818 char LoopVectorize::ID = 0; 6819 6820 static const char lv_name[] = "Loop Vectorization"; 6821 6822 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6823 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6824 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6825 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6826 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6827 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6828 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6829 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6830 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6831 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6832 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6833 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6834 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6835 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6836 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6837 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6838 6839 namespace llvm { 6840 6841 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6842 6843 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6844 bool VectorizeOnlyWhenForced) { 6845 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6846 } 6847 6848 } // end namespace llvm 6849 6850 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6851 // Check if the pointer operand of a load or store instruction is 6852 // consecutive. 6853 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6854 return Legal->isConsecutivePtr(Ptr); 6855 return false; 6856 } 6857 6858 void LoopVectorizationCostModel::collectValuesToIgnore() { 6859 // Ignore ephemeral values. 6860 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6861 6862 // Ignore type-promoting instructions we identified during reduction 6863 // detection. 6864 for (auto &Reduction : Legal->getReductionVars()) { 6865 RecurrenceDescriptor &RedDes = Reduction.second; 6866 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6867 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6868 } 6869 // Ignore type-casting instructions we identified during induction 6870 // detection. 6871 for (auto &Induction : Legal->getInductionVars()) { 6872 InductionDescriptor &IndDes = Induction.second; 6873 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6874 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6875 } 6876 } 6877 6878 void LoopVectorizationCostModel::collectInLoopReductions() { 6879 // For the moment, without predicated reduction instructions, we do not 6880 // support inloop reductions whilst folding the tail, and hence in those cases 6881 // all reductions are currently out of the loop. 6882 if (foldTailByMasking()) 6883 return; 6884 6885 for (auto &Reduction : Legal->getReductionVars()) { 6886 PHINode *Phi = Reduction.first; 6887 RecurrenceDescriptor &RdxDesc = Reduction.second; 6888 6889 // We don't collect reductions that are type promoted (yet). 6890 if (RdxDesc.getRecurrenceType() != Phi->getType()) 6891 continue; 6892 6893 // If the target would prefer this reduction to happen "in-loop", then we 6894 // want to record it as such. 
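    // (Illustrative note: an in-loop reduction folds each vector part into the
    // scalar chain inside the loop, e.g. via a per-part reduction, instead of
    // keeping a wide accumulator that is only reduced after the loop.)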
6895 unsigned Opcode = RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()); 6896 if (!PreferInLoopReductions && 6897 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 6898 TargetTransformInfo::ReductionFlags())) 6899 continue; 6900 6901 // Check that we can correctly put the reductions into the loop, by 6902 // finding the chain of operations that leads from the phi to the loop 6903 // exit value. 6904 SmallVector<Instruction *, 4> ReductionOperations = 6905 RdxDesc.getReductionOpChain(Phi, TheLoop); 6906 bool InLoop = !ReductionOperations.empty(); 6907 if (InLoop) 6908 InLoopReductionChains[Phi] = ReductionOperations; 6909 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 6910 << " reduction for phi: " << *Phi << "\n"); 6911 } 6912 } 6913 6914 // TODO: we could return a pair of values that specify the max VF and 6915 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6916 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 6917 // doesn't have a cost model that can choose which plan to execute if 6918 // more than one is generated. 6919 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6920 LoopVectorizationCostModel &CM) { 6921 unsigned WidestType; 6922 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6923 return WidestVectorRegBits / WidestType; 6924 } 6925 6926 VectorizationFactor 6927 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 6928 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 6929 ElementCount VF = UserVF; 6930 // Outer loop handling: They may require CFG and instruction level 6931 // transformations before even evaluating whether vectorization is profitable. 6932 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6933 // the vectorization pipeline. 6934 if (!OrigLoop->isInnermost()) { 6935 // If the user doesn't provide a vectorization factor, determine a 6936 // reasonable one. 6937 if (UserVF.isZero()) { 6938 VF = ElementCount::getFixed( 6939 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 6940 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6941 6942 // Make sure we have a VF > 1 for stress testing. 6943 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 6944 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6945 << "overriding computed VF.\n"); 6946 VF = ElementCount::getFixed(4); 6947 } 6948 } 6949 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6950 assert(isPowerOf2_32(VF.getKnownMinValue()) && 6951 "VF needs to be a power of two"); 6952 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 6953 << "VF " << VF << " to build VPlans.\n"); 6954 buildVPlans(VF.getKnownMinValue(), VF.getKnownMinValue()); 6955 6956 // For VPlan build stress testing, we bail out after VPlan construction. 6957 if (VPlanBuildStressTest) 6958 return VectorizationFactor::Disabled(); 6959 6960 return {VF, 0 /*Cost*/}; 6961 } 6962 6963 LLVM_DEBUG( 6964 dbgs() << "LV: Not vectorizing. 
Inner loops aren't supported in the " 6965 "VPlan-native path.\n"); 6966 return VectorizationFactor::Disabled(); 6967 } 6968 6969 Optional<VectorizationFactor> 6970 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 6971 assert(!UserVF.isScalable() && "scalable vectorization not yet handled"); 6972 assert(OrigLoop->isInnermost() && "Inner loop expected."); 6973 Optional<unsigned> MaybeMaxVF = 6974 CM.computeMaxVF(UserVF.getKnownMinValue(), UserIC); 6975 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6976 return None; 6977 6978 // Invalidate interleave groups if all blocks of loop will be predicated. 6979 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6980 !useMaskedInterleavedAccesses(*TTI)) { 6981 LLVM_DEBUG( 6982 dbgs() 6983 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6984 "which requires masked-interleaved support.\n"); 6985 if (CM.InterleaveInfo.invalidateGroups()) 6986 // Invalidating interleave groups also requires invalidating all decisions 6987 // based on them, which includes widening decisions and uniform and scalar 6988 // values. 6989 CM.invalidateCostModelingDecisions(); 6990 } 6991 6992 if (!UserVF.isZero()) { 6993 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6994 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 6995 "VF needs to be a power of two"); 6996 // Collect the instructions (and their associated costs) that will be more 6997 // profitable to scalarize. 6998 CM.selectUserVectorizationFactor(UserVF); 6999 CM.collectInLoopReductions(); 7000 buildVPlansWithVPRecipes(UserVF.getKnownMinValue(), 7001 UserVF.getKnownMinValue()); 7002 LLVM_DEBUG(printPlans(dbgs())); 7003 return {{UserVF, 0}}; 7004 } 7005 7006 unsigned MaxVF = MaybeMaxVF.getValue(); 7007 assert(MaxVF != 0 && "MaxVF is zero."); 7008 7009 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 7010 // Collect Uniform and Scalar instructions after vectorization with VF. 7011 CM.collectUniformsAndScalars(ElementCount::getFixed(VF)); 7012 7013 // Collect the instructions (and their associated costs) that will be more 7014 // profitable to scalarize. 7015 if (VF > 1) 7016 CM.collectInstsToScalarize(ElementCount::getFixed(VF)); 7017 } 7018 7019 CM.collectInLoopReductions(); 7020 7021 buildVPlansWithVPRecipes(1, MaxVF); 7022 LLVM_DEBUG(printPlans(dbgs())); 7023 if (MaxVF == 1) 7024 return VectorizationFactor::Disabled(); 7025 7026 // Select the optimal vectorization factor. 7027 return CM.selectVectorizationFactor(MaxVF); 7028 } 7029 7030 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7031 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7032 << '\n'); 7033 BestVF = VF; 7034 BestUF = UF; 7035 7036 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7037 return !Plan->hasVF(VF); 7038 }); 7039 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7040 } 7041 7042 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7043 DominatorTree *DT) { 7044 // Perform the actual loop transformation. 7045 7046 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
7047 VPCallbackILV CallbackILV(ILV); 7048 7049 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7050 7051 VPTransformState State{*BestVF, BestUF, LI, 7052 DT, ILV.Builder, ILV.VectorLoopValueMap, 7053 &ILV, CallbackILV}; 7054 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7055 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7056 State.CanonicalIV = ILV.Induction; 7057 7058 //===------------------------------------------------===// 7059 // 7060 // Notice: any optimization or new instruction that go 7061 // into the code below should also be implemented in 7062 // the cost-model. 7063 // 7064 //===------------------------------------------------===// 7065 7066 // 2. Copy and widen instructions from the old loop into the new loop. 7067 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7068 VPlans.front()->execute(&State); 7069 7070 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7071 // predication, updating analyses. 7072 ILV.fixVectorizedLoop(); 7073 } 7074 7075 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7076 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7077 BasicBlock *Latch = OrigLoop->getLoopLatch(); 7078 7079 // We create new control-flow for the vectorized loop, so the original 7080 // condition will be dead after vectorization if it's only used by the 7081 // branch. 7082 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 7083 if (Cmp && Cmp->hasOneUse()) 7084 DeadInstructions.insert(Cmp); 7085 7086 // We create new "steps" for induction variable updates to which the original 7087 // induction variables map. An original update instruction will be dead if 7088 // all its users except the induction variable are dead. 7089 for (auto &Induction : Legal->getInductionVars()) { 7090 PHINode *Ind = Induction.first; 7091 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7092 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7093 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7094 })) 7095 DeadInstructions.insert(IndUpdate); 7096 7097 // We record as "Dead" also the type-casting instructions we had identified 7098 // during induction analysis. We don't need any handling for them in the 7099 // vectorized loop because we have proven that, under a proper runtime 7100 // test guarding the vectorized loop, the value of the phi, and the casted 7101 // value of the phi, are the same. The last instruction in this casting chain 7102 // will get its scalar/vector/widened def from the scalar/vector/widened def 7103 // of the respective phi node. Any other casts in the induction def-use chain 7104 // have no other uses outside the phi update chain, and will be ignored. 7105 InductionDescriptor &IndDes = Induction.second; 7106 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7107 DeadInstructions.insert(Casts.begin(), Casts.end()); 7108 } 7109 } 7110 7111 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7112 7113 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7114 7115 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7116 Instruction::BinaryOps BinOp) { 7117 // When unrolling and the VF is 1, we only need to add a simple scalar. 
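  // For example (illustrative), with StartIdx == 2 and step %step this
  // produces Val + 2 * %step; for floating-point inductions the same value is
  // formed with fmul and the supplied BinOp.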
7118 Type *Ty = Val->getType(); 7119 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7120 7121 if (Ty->isFloatingPointTy()) { 7122 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7123 7124 // Floating point operations had to be 'fast' to enable the unrolling. 7125 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7126 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7127 } 7128 Constant *C = ConstantInt::get(Ty, StartIdx); 7129 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7130 } 7131 7132 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7133 SmallVector<Metadata *, 4> MDs; 7134 // Reserve first location for self reference to the LoopID metadata node. 7135 MDs.push_back(nullptr); 7136 bool IsUnrollMetadata = false; 7137 MDNode *LoopID = L->getLoopID(); 7138 if (LoopID) { 7139 // First find existing loop unrolling disable metadata. 7140 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7141 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7142 if (MD) { 7143 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7144 IsUnrollMetadata = 7145 S && S->getString().startswith("llvm.loop.unroll.disable"); 7146 } 7147 MDs.push_back(LoopID->getOperand(i)); 7148 } 7149 } 7150 7151 if (!IsUnrollMetadata) { 7152 // Add runtime unroll disable metadata. 7153 LLVMContext &Context = L->getHeader()->getContext(); 7154 SmallVector<Metadata *, 1> DisableOperands; 7155 DisableOperands.push_back( 7156 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7157 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7158 MDs.push_back(DisableNode); 7159 MDNode *NewLoopID = MDNode::get(Context, MDs); 7160 // Set operand 0 to refer to the loop id itself. 7161 NewLoopID->replaceOperandWith(0, NewLoopID); 7162 L->setLoopID(NewLoopID); 7163 } 7164 } 7165 7166 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7167 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7168 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 7169 bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start)); 7170 7171 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 7172 if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) { 7173 Range.End = TmpVF; 7174 break; 7175 } 7176 7177 return PredicateAtRangeStart; 7178 } 7179 7180 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7181 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7182 /// of VF's starting at a given VF and extending it as much as possible. Each 7183 /// vectorization decision can potentially shorten this sub-range during 7184 /// buildVPlan(). 7185 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 7186 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7187 VFRange SubRange = {VF, MaxVF + 1}; 7188 VPlans.push_back(buildVPlan(SubRange)); 7189 VF = SubRange.End; 7190 } 7191 } 7192 7193 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7194 VPlanPtr &Plan) { 7195 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7196 7197 // Look for cached value. 7198 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7199 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7200 if (ECEntryIt != EdgeMaskCache.end()) 7201 return ECEntryIt->second; 7202 7203 VPValue *SrcMask = createBlockInMask(Src, Plan); 7204 7205 // The terminator has to be a branch inst! 
7206 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7207 assert(BI && "Unexpected terminator found"); 7208 7209 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7210 return EdgeMaskCache[Edge] = SrcMask; 7211 7212 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 7213 assert(EdgeMask && "No Edge Mask found for condition"); 7214 7215 if (BI->getSuccessor(0) != Dst) 7216 EdgeMask = Builder.createNot(EdgeMask); 7217 7218 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 7219 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 7220 7221 return EdgeMaskCache[Edge] = EdgeMask; 7222 } 7223 7224 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7225 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7226 7227 // Look for cached value. 7228 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7229 if (BCEntryIt != BlockMaskCache.end()) 7230 return BCEntryIt->second; 7231 7232 // All-one mask is modelled as no-mask following the convention for masked 7233 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7234 VPValue *BlockMask = nullptr; 7235 7236 if (OrigLoop->getHeader() == BB) { 7237 if (!CM.blockNeedsPredication(BB)) 7238 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 7239 7240 // Introduce the early-exit compare IV <= BTC to form header block mask. 7241 // This is used instead of IV < TC because TC may wrap, unlike BTC. 7242 // Start by constructing the desired canonical IV. 7243 VPValue *IV = nullptr; 7244 if (Legal->getPrimaryInduction()) 7245 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 7246 else { 7247 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 7248 Builder.getInsertBlock()->appendRecipe(IVRecipe); 7249 IV = IVRecipe->getVPValue(); 7250 } 7251 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 7252 bool TailFolded = !CM.isScalarEpilogueAllowed(); 7253 7254 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 7255 // While ActiveLaneMask is a binary op that consumes the loop tripcount 7256 // as a second argument, we only pass the IV here and extract the 7257 // tripcount from the transform state where codegen of the VP instructions 7258 // happen. 7259 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 7260 } else { 7261 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 7262 } 7263 return BlockMaskCache[BB] = BlockMask; 7264 } 7265 7266 // This is the block mask. We OR all incoming edges. 7267 for (auto *Predecessor : predecessors(BB)) { 7268 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 7269 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 7270 return BlockMaskCache[BB] = EdgeMask; 7271 7272 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
7273 BlockMask = EdgeMask; 7274 continue; 7275 } 7276 7277 BlockMask = Builder.createOr(BlockMask, EdgeMask); 7278 } 7279 7280 return BlockMaskCache[BB] = BlockMask; 7281 } 7282 7283 VPWidenMemoryInstructionRecipe * 7284 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 7285 VPlanPtr &Plan) { 7286 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7287 "Must be called with either a load or store"); 7288 7289 auto willWiden = [&](ElementCount VF) -> bool { 7290 assert(!VF.isScalable() && "unexpected scalable ElementCount"); 7291 if (VF.isScalar()) 7292 return false; 7293 LoopVectorizationCostModel::InstWidening Decision = 7294 CM.getWideningDecision(I, VF); 7295 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 7296 "CM decision should be taken at this point."); 7297 if (Decision == LoopVectorizationCostModel::CM_Interleave) 7298 return true; 7299 if (CM.isScalarAfterVectorization(I, VF) || 7300 CM.isProfitableToScalarize(I, VF)) 7301 return false; 7302 return Decision != LoopVectorizationCostModel::CM_Scalarize; 7303 }; 7304 7305 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7306 return nullptr; 7307 7308 VPValue *Mask = nullptr; 7309 if (Legal->isMaskRequired(I)) 7310 Mask = createBlockInMask(I->getParent(), Plan); 7311 7312 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 7313 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 7314 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 7315 7316 StoreInst *Store = cast<StoreInst>(I); 7317 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 7318 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 7319 } 7320 7321 VPWidenIntOrFpInductionRecipe * 7322 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 7323 // Check if this is an integer or fp induction. If so, build the recipe that 7324 // produces its scalar and vector values. 7325 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 7326 if (II.getKind() == InductionDescriptor::IK_IntInduction || 7327 II.getKind() == InductionDescriptor::IK_FpInduction) 7328 return new VPWidenIntOrFpInductionRecipe(Phi); 7329 7330 return nullptr; 7331 } 7332 7333 VPWidenIntOrFpInductionRecipe * 7334 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 7335 VFRange &Range) const { 7336 // Optimize the special case where the source is a constant integer 7337 // induction variable. Notice that we can only optimize the 'trunc' case 7338 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 7339 // (c) other casts depend on pointer size. 7340 7341 // Determine whether \p K is a truncation based on an induction variable that 7342 // can be optimized. 7343 auto isOptimizableIVTruncate = 7344 [&](Instruction *K) -> std::function<bool(ElementCount)> { 7345 return [=](ElementCount VF) -> bool { 7346 return CM.isOptimizableIVTruncate(K, VF); 7347 }; 7348 }; 7349 7350 if (LoopVectorizationPlanner::getDecisionAndClampRange( 7351 isOptimizableIVTruncate(I), Range)) 7352 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 7353 I); 7354 return nullptr; 7355 } 7356 7357 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 7358 // We know that all PHIs in non-header blocks are converted into selects, so 7359 // we don't have to worry about the insertion order and we can just use the 7360 // builder. At this point we generate the predication tree. 
There may be 7361 // duplications since this is a simple recursive scan, but future 7362 // optimizations will clean it up. 7363 7364 SmallVector<VPValue *, 2> Operands; 7365 unsigned NumIncoming = Phi->getNumIncomingValues(); 7366 for (unsigned In = 0; In < NumIncoming; In++) { 7367 VPValue *EdgeMask = 7368 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 7369 assert((EdgeMask || NumIncoming == 1) && 7370 "Multiple predecessors with one having a full mask"); 7371 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 7372 if (EdgeMask) 7373 Operands.push_back(EdgeMask); 7374 } 7375 return new VPBlendRecipe(Phi, Operands); 7376 } 7377 7378 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 7379 VPlan &Plan) const { 7380 7381 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7382 [this, CI](ElementCount VF) { 7383 return CM.isScalarWithPredication(CI, VF); 7384 }, 7385 Range); 7386 7387 if (IsPredicated) 7388 return nullptr; 7389 7390 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7391 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 7392 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 7393 return nullptr; 7394 7395 auto willWiden = [&](ElementCount VF) -> bool { 7396 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7397 // The following case may be scalarized depending on the VF. 7398 // The flag shows whether we use Intrinsic or a usual Call for vectorized 7399 // version of the instruction. 7400 // Is it beneficial to perform intrinsic call compared to lib call? 7401 bool NeedToScalarize = false; 7402 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 7403 bool UseVectorIntrinsic = 7404 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 7405 return UseVectorIntrinsic || !NeedToScalarize; 7406 }; 7407 7408 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7409 return nullptr; 7410 7411 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 7412 } 7413 7414 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 7415 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 7416 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 7417 // Instruction should be widened, unless it is scalar after vectorization, 7418 // scalarization is profitable or it is predicated. 
7419 auto WillScalarize = [this, I](ElementCount VF) -> bool { 7420 return CM.isScalarAfterVectorization(I, VF) || 7421 CM.isProfitableToScalarize(I, VF) || 7422 CM.isScalarWithPredication(I, VF); 7423 }; 7424 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 7425 Range); 7426 } 7427 7428 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 7429 auto IsVectorizableOpcode = [](unsigned Opcode) { 7430 switch (Opcode) { 7431 case Instruction::Add: 7432 case Instruction::And: 7433 case Instruction::AShr: 7434 case Instruction::BitCast: 7435 case Instruction::FAdd: 7436 case Instruction::FCmp: 7437 case Instruction::FDiv: 7438 case Instruction::FMul: 7439 case Instruction::FNeg: 7440 case Instruction::FPExt: 7441 case Instruction::FPToSI: 7442 case Instruction::FPToUI: 7443 case Instruction::FPTrunc: 7444 case Instruction::FRem: 7445 case Instruction::FSub: 7446 case Instruction::ICmp: 7447 case Instruction::IntToPtr: 7448 case Instruction::LShr: 7449 case Instruction::Mul: 7450 case Instruction::Or: 7451 case Instruction::PtrToInt: 7452 case Instruction::SDiv: 7453 case Instruction::Select: 7454 case Instruction::SExt: 7455 case Instruction::Shl: 7456 case Instruction::SIToFP: 7457 case Instruction::SRem: 7458 case Instruction::Sub: 7459 case Instruction::Trunc: 7460 case Instruction::UDiv: 7461 case Instruction::UIToFP: 7462 case Instruction::URem: 7463 case Instruction::Xor: 7464 case Instruction::ZExt: 7465 return true; 7466 } 7467 return false; 7468 }; 7469 7470 if (!IsVectorizableOpcode(I->getOpcode())) 7471 return nullptr; 7472 7473 // Success: widen this instruction. 7474 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 7475 } 7476 7477 VPBasicBlock *VPRecipeBuilder::handleReplication( 7478 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7479 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7480 VPlanPtr &Plan) { 7481 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7482 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 7483 Range); 7484 7485 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7486 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 7487 Range); 7488 7489 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 7490 IsUniform, IsPredicated); 7491 setRecipe(I, Recipe); 7492 7493 // Find if I uses a predicated instruction. If so, it will use its scalar 7494 // value. Avoid hoisting the insert-element which packs the scalar value into 7495 // a vector value, as that happens iff all users use the vector value. 7496 for (auto &Op : I->operands()) 7497 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7498 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7499 PredInst2Recipe[PredInst]->setAlsoPack(false); 7500 7501 // Finalize the recipe for Instr, first if it is not predicated. 7502 if (!IsPredicated) { 7503 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7504 VPBB->appendRecipe(Recipe); 7505 return VPBB; 7506 } 7507 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7508 assert(VPBB->getSuccessors().empty() && 7509 "VPBB has successors when handling predicated replication."); 7510 // Record predicated instructions for above packing optimizations. 
7511 PredInst2Recipe[I] = Recipe; 7512 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7513 VPBlockUtils::insertBlockAfter(Region, VPBB); 7514 auto *RegSucc = new VPBasicBlock(); 7515 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7516 return RegSucc; 7517 } 7518 7519 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7520 VPRecipeBase *PredRecipe, 7521 VPlanPtr &Plan) { 7522 // Instructions marked for predication are replicated and placed under an 7523 // if-then construct to prevent side-effects. 7524 7525 // Generate recipes to compute the block mask for this region. 7526 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7527 7528 // Build the triangular if-then region. 7529 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7530 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7531 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7532 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7533 auto *PHIRecipe = 7534 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7535 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7536 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7537 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7538 7539 // Note: first set Entry as region entry and then connect successors starting 7540 // from it in order, to propagate the "parent" of each VPBasicBlock. 7541 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7542 VPBlockUtils::connectBlocks(Pred, Exit); 7543 7544 return Region; 7545 } 7546 7547 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 7548 VFRange &Range, 7549 VPlanPtr &Plan) { 7550 // First, check for specific widening recipes that deal with calls, memory 7551 // operations, inductions and Phi nodes. 7552 if (auto *CI = dyn_cast<CallInst>(Instr)) 7553 return tryToWidenCall(CI, Range, *Plan); 7554 7555 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 7556 return tryToWidenMemory(Instr, Range, Plan); 7557 7558 VPRecipeBase *Recipe; 7559 if (auto Phi = dyn_cast<PHINode>(Instr)) { 7560 if (Phi->getParent() != OrigLoop->getHeader()) 7561 return tryToBlend(Phi, Plan); 7562 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 7563 return Recipe; 7564 return new VPWidenPHIRecipe(Phi); 7565 } 7566 7567 if (isa<TruncInst>(Instr) && 7568 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 7569 return Recipe; 7570 7571 if (!shouldWiden(Instr, Range)) 7572 return nullptr; 7573 7574 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 7575 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 7576 OrigLoop); 7577 7578 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 7579 bool InvariantCond = 7580 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 7581 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 7582 InvariantCond); 7583 } 7584 7585 return tryToWiden(Instr, *Plan); 7586 } 7587 7588 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7589 unsigned MaxVF) { 7590 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7591 7592 // Collect conditions feeding internal conditional branches; they need to be 7593 // represented in VPlan for it to model masking. 
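  // For example (illustrative), the i1 compare feeding an 'if (a[i] < 0)'
  // branch in the loop body needs a VPValue def so that the edge and block
  // masks created by VPRecipeBuilder can refer to it.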
  SmallPtrSet<Value *, 1> NeedDef;

  auto *Latch = OrigLoop->getLoopLatch();
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (BB == Latch)
      continue;
    BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
    if (Branch && Branch->isConditional())
      NeedDef.insert(Branch->getCondition());
  }

  // If the tail is to be folded by masking, the primary induction variable, if
  // it exists, needs to be represented in VPlan for it to model early-exit
  // masking. Also, both the Phi and the live-out instruction of each reduction
  // are required in order to introduce a select between them in VPlan.
  if (CM.foldTailByMasking()) {
    if (Legal->getPrimaryInduction())
      NeedDef.insert(Legal->getPrimaryInduction());
    for (auto &Reduction : Legal->getReductionVars()) {
      NeedDef.insert(Reduction.first);
      NeedDef.insert(Reduction.second.getLoopExitInstr());
    }
  }

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
                                             DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor::RecurrenceKind Kind =
        Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
          Kind == RecurrenceDescriptor::RK_FloatMinMax) {
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
      }
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  }

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Represent values that will have defs inside VPlan.
  for (Value *V : NeedDef)
    Plan->addVPValue(V);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
7743 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 7744 continue; 7745 7746 if (auto Recipe = 7747 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 7748 RecipeBuilder.setRecipe(Instr, Recipe); 7749 VPBB->appendRecipe(Recipe); 7750 continue; 7751 } 7752 7753 // Otherwise, if all widening options failed, Instruction is to be 7754 // replicated. This may create a successor for VPBB. 7755 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7756 Instr, Range, VPBB, PredInst2Recipe, Plan); 7757 if (NextVPBB != VPBB) { 7758 VPBB = NextVPBB; 7759 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7760 : ""); 7761 } 7762 } 7763 } 7764 7765 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7766 // may also be empty, such as the last one VPBB, reflecting original 7767 // basic-blocks with no recipes. 7768 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7769 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7770 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7771 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7772 delete PreEntry; 7773 7774 // --------------------------------------------------------------------------- 7775 // Transform initial VPlan: Apply previously taken decisions, in order, to 7776 // bring the VPlan to its final state. 7777 // --------------------------------------------------------------------------- 7778 7779 // Apply Sink-After legal constraints. 7780 for (auto &Entry : SinkAfter) { 7781 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7782 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7783 Sink->moveAfter(Target); 7784 } 7785 7786 // Interleave memory: for each Interleave Group we marked earlier as relevant 7787 // for this VPlan, replace the Recipes widening its memory instructions with a 7788 // single VPInterleaveRecipe at its insertion point. 7789 for (auto IG : InterleaveGroups) { 7790 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7791 RecipeBuilder.getRecipe(IG->getInsertPos())); 7792 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7793 ->insertBefore(Recipe); 7794 7795 for (unsigned i = 0; i < IG->getFactor(); ++i) 7796 if (Instruction *Member = IG->getMember(i)) { 7797 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7798 } 7799 } 7800 7801 // Adjust the recipes for any inloop reductions. 7802 if (Range.Start > 1) 7803 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 7804 7805 // Finally, if tail is folded by masking, introduce selects between the phi 7806 // and the live-out instruction of each reduction, at the end of the latch. 
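  // For example (illustrative), a sum reduction with live-out %red.next and
  // header phi %red.phi gets 'select(%header.mask, %red.next, %red.phi)', so
  // lanes masked off by tail folding keep the incoming phi value.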
  if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
    Builder.setInsertPoint(VPBB);
    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
    for (auto &Reduction : Legal->getReductionVars()) {
      assert(!CM.isInLoopReduction(Reduction.first) &&
             "Didn't expect inloop tail folded reduction yet!");
      VPValue *Phi = Plan->getVPValue(Reduction.first);
      VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
      Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
    }
  }

  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  ElementCount VF = ElementCount::getFixed(Range.Start);
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; VF.getKnownMinValue() < Range.End; VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}

VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
    Plan->addVF(ElementCount::getFixed(VF));

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(
      OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
  return Plan;
}

// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
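    // For example (illustrative), for the chain
    //   %a1 = add %phi, %x ; %a2 = add %a1, %y
    // %a1 gets ChainOp %phi and VecOp %x, and %a2 then chains onto %a1 with
    // VecOp %y.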
7883 Instruction *Chain = Phi; 7884 for (Instruction *R : ReductionOperations) { 7885 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 7886 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind(); 7887 7888 VPValue *ChainOp = Plan->getVPValue(Chain); 7889 unsigned FirstOpId; 7890 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7891 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7892 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC && 7893 "Expected to replace a VPWidenSelectSC"); 7894 FirstOpId = 1; 7895 } else { 7896 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC && 7897 "Expected to replace a VPWidenSC"); 7898 FirstOpId = 0; 7899 } 7900 unsigned VecOpId = 7901 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 7902 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 7903 7904 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 7905 &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI); 7906 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 7907 WidenRecipe->eraseFromParent(); 7908 7909 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7910 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7911 VPRecipeBase *CompareRecipe = 7912 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 7913 assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC && 7914 "Expected to replace a VPWidenSC"); 7915 CompareRecipe->eraseFromParent(); 7916 } 7917 Chain = R; 7918 } 7919 } 7920 } 7921 7922 Value* LoopVectorizationPlanner::VPCallbackILV:: 7923 getOrCreateVectorValues(Value *V, unsigned Part) { 7924 return ILV.getOrCreateVectorValue(V, Part); 7925 } 7926 7927 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7928 Value *V, const VPIteration &Instance) { 7929 return ILV.getOrCreateScalarValue(V, Instance); 7930 } 7931 7932 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7933 VPSlotTracker &SlotTracker) const { 7934 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7935 IG->getInsertPos()->printAsOperand(O, false); 7936 O << ", "; 7937 getAddr()->printAsOperand(O, SlotTracker); 7938 VPValue *Mask = getMask(); 7939 if (Mask) { 7940 O << ", "; 7941 Mask->printAsOperand(O, SlotTracker); 7942 } 7943 for (unsigned i = 0; i < IG->getFactor(); ++i) 7944 if (Instruction *I = IG->getMember(i)) 7945 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 7946 } 7947 7948 void VPWidenCallRecipe::execute(VPTransformState &State) { 7949 State.ILV->widenCallInstruction(Ingredient, *this, State); 7950 } 7951 7952 void VPWidenSelectRecipe::execute(VPTransformState &State) { 7953 State.ILV->widenSelectInstruction(Ingredient, *this, InvariantCond, State); 7954 } 7955 7956 void VPWidenRecipe::execute(VPTransformState &State) { 7957 State.ILV->widenInstruction(Ingredient, *this, State); 7958 } 7959 7960 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7961 State.ILV->widenGEP(GEP, *this, State.UF, State.VF, IsPtrLoopInvariant, 7962 IsIndexLoopInvariant, State); 7963 } 7964 7965 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7966 assert(!State.Instance && "Int or FP induction being replicated."); 7967 State.ILV->widenIntOrFpInduction(IV, Trunc); 7968 } 7969 7970 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7971 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7972 } 7973 7974 void VPBlendRecipe::execute(VPTransformState &State) { 7975 
State.ILV->setDebugLocFromInst(State.Builder, Phi); 7976 // We know that all PHIs in non-header blocks are converted into 7977 // selects, so we don't have to worry about the insertion order and we 7978 // can just use the builder. 7979 // At this point we generate the predication tree. There may be 7980 // duplications since this is a simple recursive scan, but future 7981 // optimizations will clean it up. 7982 7983 unsigned NumIncoming = getNumIncomingValues(); 7984 7985 // Generate a sequence of selects of the form: 7986 // SELECT(Mask3, In3, 7987 // SELECT(Mask2, In2, 7988 // SELECT(Mask1, In1, 7989 // In0))) 7990 // Note that Mask0 is never used: lanes for which no path reaches this phi and 7991 // are essentially undef are taken from In0. 7992 InnerLoopVectorizer::VectorParts Entry(State.UF); 7993 for (unsigned In = 0; In < NumIncoming; ++In) { 7994 for (unsigned Part = 0; Part < State.UF; ++Part) { 7995 // We might have single edge PHIs (blocks) - use an identity 7996 // 'select' for the first PHI operand. 7997 Value *In0 = State.get(getIncomingValue(In), Part); 7998 if (In == 0) 7999 Entry[Part] = In0; // Initialize with the first incoming value. 8000 else { 8001 // Select between the current value and the previous incoming edge 8002 // based on the incoming mask. 8003 Value *Cond = State.get(getMask(In), Part); 8004 Entry[Part] = 8005 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 8006 } 8007 } 8008 } 8009 for (unsigned Part = 0; Part < State.UF; ++Part) 8010 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 8011 } 8012 8013 void VPInterleaveRecipe::execute(VPTransformState &State) { 8014 assert(!State.Instance && "Interleave group being replicated."); 8015 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); 8016 } 8017 8018 void VPReductionRecipe::execute(VPTransformState &State) { 8019 assert(!State.Instance && "Reduction being replicated."); 8020 for (unsigned Part = 0; Part < State.UF; ++Part) { 8021 unsigned Kind = RdxDesc->getRecurrenceKind(); 8022 Value *NewVecOp = State.get(VecOp, Part); 8023 Value *NewRed = 8024 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN); 8025 Value *PrevInChain = State.get(ChainOp, Part); 8026 Value *NextInChain; 8027 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8028 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8029 NextInChain = 8030 createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(), 8031 NewRed, PrevInChain); 8032 } else { 8033 NextInChain = State.Builder.CreateBinOp( 8034 (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain); 8035 } 8036 State.ValueMap.setVectorValue(I, Part, NextInChain); 8037 } 8038 } 8039 8040 void VPReplicateRecipe::execute(VPTransformState &State) { 8041 if (State.Instance) { // Generate a single instance. 8042 State.ILV->scalarizeInstruction(Ingredient, *this, *State.Instance, 8043 IsPredicated, State); 8044 // Insert scalar instance packing it into a vector. 8045 if (AlsoPack && State.VF.isVector()) { 8046 // If we're constructing lane 0, initialize to start from undef. 
      if (State.Instance->Lane == 0) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, *this, {Part, Lane},
                                      IsPredicated, State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst = cast<Instruction>(
      State.ValueMap.getScalarValue(PredInst, *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
  } else {
    Type *PredInstType = PredInst->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
  }
}

void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
  State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
                                        getMask());
}

// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
    LoopVectorizationLegality &LVL) {
  // 1) OptSize takes precedence over all other options, i.e. if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
  // LoopAccessInfo (due to code dependency and not being able to reliably get
  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
  // versioning when the vectorization is forced, unlike hasOptSize. So revert
  // back to the old way and vectorize with versioning when forced. See D81345.)
  if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                      PGSOQueryType::IRPass) &&
                          Hints.getForce() != LoopVectorizeHints::FK_Enabled))
    return CM_ScalarEpilogueNotAllowedOptSize;

  bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
                              !PreferPredicateOverEpilogue;

  // 2) Next, if disabling predication is requested on the command line, honour
  // this and request a scalar epilogue.
  if (PredicateOptDisabled)
    return CM_ScalarEpilogueAllowed;

  // 3) and 4) Check whether predication is requested on the command line or
  // with a loop hint, or whether the TTI hook indicates it is profitable; if
  // so, request predication.
  if (PreferPredicateOverEpilogue ||
      Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
      (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                        LVL.getLAI()) &&
       Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
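// Returns true if the outer loop was vectorized. Bails out (returning false)
// when the outer-loop trip count cannot be computed, when VPlan build
// stress-testing or VPlan predication is requested, or when planning does not
// select a vectorization factor.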
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);

  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize; return the best VF and its cost.
  const VectorizationFactor VF =
      LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM, BFI, PSI);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " unroll=" << Hints.getInterleave() << "\n");

  // Function containing the loop.
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor and interleave count.
  unsigned UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize; return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF =
      LVP.plan(ElementCount::getFixed(UserVF), UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
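    // Neither the cost model (IC == 1) nor the user (UserIC <= 1) asked for
    // more than one interleaved copy of the loop.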
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not profitable to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM, BFI, PSI);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
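    // Only the decision is recorded here; the metadata itself is attached
    // further below, unless a follow-up loop ID from the original loop
    // metadata takes precedence.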
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
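  // The result records whether any IR change was made and, separately, whether
  // the CFG changed; run() below uses this to decide which analyses can be
  // preserved.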
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}