//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
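//
// As a simple illustration (not taken from the papers above), a loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is transformed so that each vector iteration processes VF elements at once:
// for VF = 4 the induction variable is advanced by 4 and the add is performed
// on <4 x i32> values, while a scalar remainder (epilogue) loop handles the
// final n % 4 iterations.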
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
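/// For example (illustrative), if only A[3*i] and A[3*i+2] of a factor-3 group
/// are accessed, the wide load also covers the unused member A[3*i+1]; that gap
/// must either be masked away or be handled by a scalar epilogue iteration.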
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
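/// For example (illustrative), x86_fp80 is stored in 10 bytes but is allocated
/// with padding (12 or 16 bytes, depending on the data layout), so an array of
/// x86_fp80 is not bitcast-compatible with a vector of x86_fp80 and the type is
/// considered irregular.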
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return TypeSize::get(VF.getKnownMinValue() *
                             DL.getTypeAllocSize(Ty).getFixedValue(),
                         VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
                ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
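  /// For example (illustrative), a store that is guarded by a condition in the
  /// original loop may be widened into an @llvm.masked.store call whose mask is
  /// the vector value that \p State provides for \p BlockInMask.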
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);
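
  // For example (illustrative), for VF = 4, StartIdx = 0 and an integer step
  // %s, getStepVector turns the broadcast value <%x, %x, %x, %x> into
  // <%x, %x + %s, %x + 2*%s, %x + 3*%s>.
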
  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  void createInductionResumeValues(Loop *L, Value *VectorTripCount);

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};
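// For illustration (not tied to any particular target), with a scalar epilogue
// the generated code has the shape
//
//   vector.body:  processes VF * UF elements per iteration
//   scalar loop:  executes the remaining TripCount % (VF * UF) iterations
//
// whereas under CM_ScalarEpilogueNotNeededUsePredicate the tail is folded into
// the vector body by masking the excess lanes, and no scalar remainder loop is
// emitted.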
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W, unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(!VF.isScalable() && "scalable vectors not yet supported.");
    assert(VF.isVector() && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
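  /// For example (illustrative), the address computation feeding a load that
  /// the cost model decided to widen (CM_Widen) is typically uniform after
  /// vectorization, since only its lane-0 value is needed to form the wide
  /// pointer.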
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail of the
  /// loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
                             bool &NeedToScalarize);

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, a power-of-2 larger
  /// than zero. One is returned if vectorization should best be avoided due
  /// to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(ElementCount VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be
  /// truncated to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
1519 /// Under optsize and when the trip count is very small we don't allow any 1520 /// iterations to execute in the scalar loop. 1521 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1522 1523 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1524 bool FoldTailByMasking = false; 1525 1526 /// A map holding scalar costs for different vectorization factors. The 1527 /// presence of a cost for an instruction in the mapping indicates that the 1528 /// instruction will be scalarized when vectorizing with the associated 1529 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1530 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1531 1532 /// Holds the instructions known to be uniform after vectorization. 1533 /// The data is collected per VF. 1534 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1535 1536 /// Holds the instructions known to be scalar after vectorization. 1537 /// The data is collected per VF. 1538 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1539 1540 /// Holds the instructions (address computations) that are forced to be 1541 /// scalarized. 1542 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1543 1544 /// PHINodes of the reductions that should be expanded in-loop along with 1545 /// their associated chains of reduction operations, in program order from top 1546 /// (PHI) to bottom 1547 ReductionChainMap InLoopReductionChains; 1548 1549 /// Returns the expected difference in cost from scalarizing the expression 1550 /// feeding a predicated instruction \p PredInst. The instructions to 1551 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1552 /// non-negative return value implies the expression will be scalarized. 1553 /// Currently, only single-use chains are considered for scalarization. 1554 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1555 ElementCount VF); 1556 1557 /// Collect the instructions that are uniform after vectorization. An 1558 /// instruction is uniform if we represent it with a single scalar value in 1559 /// the vectorized loop corresponding to each vector iteration. Examples of 1560 /// uniform instructions include pointer operands of consecutive or 1561 /// interleaved memory accesses. Note that although uniformity implies an 1562 /// instruction will be scalar, the reverse is not true. In general, a 1563 /// scalarized instruction will be represented by VF scalar values in the 1564 /// vectorized loop, each corresponding to an iteration of the original 1565 /// scalar loop. 1566 void collectLoopUniforms(ElementCount VF); 1567 1568 /// Collect the instructions that are scalar after vectorization. An 1569 /// instruction is scalar if it is known to be uniform or will be scalarized 1570 /// during vectorization. Non-uniform scalarized instructions will be 1571 /// represented by VF values in the vectorized loop, each corresponding to an 1572 /// iteration of the original scalar loop. 1573 void collectLoopScalars(ElementCount VF); 1574 1575 /// Keeps cost model vectorization decision and cost for instructions. 1576 /// Right now it is used for memory instructions only. 1577 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1578 std::pair<InstWidening, unsigned>>; 1579 1580 DecisionList WideningDecisions; 1581 1582 /// Returns true if \p V is expected to be vectorized and it needs to be 1583 /// extracted. 
1584 bool needsExtract(Value *V, ElementCount VF) const { 1585 Instruction *I = dyn_cast<Instruction>(V); 1586 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1587 TheLoop->isLoopInvariant(I)) 1588 return false; 1589 1590 // Assume we can vectorize V (and hence we need extraction) if the 1591 // scalars are not computed yet. This can happen, because it is called 1592 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1593 // the scalars are collected. That should be a safe assumption in most 1594 // cases, because we check if the operands have vectorizable types 1595 // beforehand in LoopVectorizationLegality. 1596 return Scalars.find(VF) == Scalars.end() || 1597 !isScalarAfterVectorization(I, VF); 1598 }; 1599 1600 /// Returns a range containing only operands needing to be extracted. 1601 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1602 ElementCount VF) { 1603 return SmallVector<Value *, 4>(make_filter_range( 1604 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1605 } 1606 1607 public: 1608 /// The loop that we evaluate. 1609 Loop *TheLoop; 1610 1611 /// Predicated scalar evolution analysis. 1612 PredicatedScalarEvolution &PSE; 1613 1614 /// Loop Info analysis. 1615 LoopInfo *LI; 1616 1617 /// Vectorization legality. 1618 LoopVectorizationLegality *Legal; 1619 1620 /// Vector target information. 1621 const TargetTransformInfo &TTI; 1622 1623 /// Target Library Info. 1624 const TargetLibraryInfo *TLI; 1625 1626 /// Demanded bits analysis. 1627 DemandedBits *DB; 1628 1629 /// Assumption cache. 1630 AssumptionCache *AC; 1631 1632 /// Interface to emit optimization remarks. 1633 OptimizationRemarkEmitter *ORE; 1634 1635 const Function *TheFunction; 1636 1637 /// Loop Vectorize Hint. 1638 const LoopVectorizeHints *Hints; 1639 1640 /// The interleave access information contains groups of interleaved accesses 1641 /// with the same stride and close to each other. 1642 InterleavedAccessInfo &InterleaveInfo; 1643 1644 /// Values to ignore in the cost model. 1645 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1646 1647 /// Values to ignore in the cost model when VF > 1. 1648 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1649 }; 1650 1651 } // end namespace llvm 1652 1653 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1654 // vectorization. The loop needs to be annotated with #pragma omp simd 1655 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1656 // vector length information is not provided, vectorization is not considered 1657 // explicit. Interleave hints are not allowed either. These limitations will be 1658 // relaxed in the future. 1659 // Please, note that we are currently forced to abuse the pragma 'clang 1660 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1661 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1662 // provides *explicit vectorization hints* (LV can bypass legal checks and 1663 // assume that vectorization is legal). However, both hints are implemented 1664 // using the same metadata (llvm.loop.vectorize, processed by 1665 // LoopVectorizeHints). This will be fixed in the future when the native IR 1666 // representation for pragma 'omp simd' is introduced. 
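// For illustration only (example source, not taken from this file): an outer
// loop a user could annotate for explicit vectorization, which the check below
// accepts once a vector length is provided:
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)       // outer loop, VPlan-native path
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[j];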
1667 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1668 OptimizationRemarkEmitter *ORE) { 1669 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 1670 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1671 1672 // Only outer loops with an explicit vectorization hint are supported. 1673 // Unannotated outer loops are ignored. 1674 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1675 return false; 1676 1677 Function *Fn = OuterLp->getHeader()->getParent(); 1678 if (!Hints.allowVectorization(Fn, OuterLp, 1679 true /*VectorizeOnlyWhenForced*/)) { 1680 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1681 return false; 1682 } 1683 1684 if (Hints.getInterleave() > 1) { 1685 // TODO: Interleave support is future work. 1686 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1687 "outer loops.\n"); 1688 Hints.emitRemarkWithHints(); 1689 return false; 1690 } 1691 1692 return true; 1693 } 1694 1695 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1696 OptimizationRemarkEmitter *ORE, 1697 SmallVectorImpl<Loop *> &V) { 1698 // Collect inner loops and outer loops without irreducible control flow. For 1699 // now, only collect outer loops that have explicit vectorization hints. If we 1700 // are stress testing the VPlan H-CFG construction, we collect the outermost 1701 // loop of every loop nest. 1702 if (L.isInnermost() || VPlanBuildStressTest || 1703 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1704 LoopBlocksRPO RPOT(&L); 1705 RPOT.perform(LI); 1706 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1707 V.push_back(&L); 1708 // TODO: Collect inner loops inside marked outer loops in case 1709 // vectorization fails for the outer loop. Do not invoke 1710 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1711 // already known to be reducible. We can use an inherited attribute for 1712 // that. 1713 return; 1714 } 1715 } 1716 for (Loop *InnerL : L) 1717 collectSupportedLoops(*InnerL, LI, ORE, V); 1718 } 1719 1720 namespace { 1721 1722 /// The LoopVectorize Pass. 1723 struct LoopVectorize : public FunctionPass { 1724 /// Pass identification, replacement for typeid 1725 static char ID; 1726 1727 LoopVectorizePass Impl; 1728 1729 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1730 bool VectorizeOnlyWhenForced = false) 1731 : FunctionPass(ID), 1732 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1733 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1734 } 1735 1736 bool runOnFunction(Function &F) override { 1737 if (skipFunction(F)) 1738 return false; 1739 1740 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1741 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1742 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1743 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1744 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1745 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1746 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 1747 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1748 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1749 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1750 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1751 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1752 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1753 1754 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1755 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1756 1757 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1758 GetLAA, *ORE, PSI).MadeAnyChange; 1759 } 1760 1761 void getAnalysisUsage(AnalysisUsage &AU) const override { 1762 AU.addRequired<AssumptionCacheTracker>(); 1763 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1764 AU.addRequired<DominatorTreeWrapperPass>(); 1765 AU.addRequired<LoopInfoWrapperPass>(); 1766 AU.addRequired<ScalarEvolutionWrapperPass>(); 1767 AU.addRequired<TargetTransformInfoWrapperPass>(); 1768 AU.addRequired<AAResultsWrapperPass>(); 1769 AU.addRequired<LoopAccessLegacyAnalysis>(); 1770 AU.addRequired<DemandedBitsWrapperPass>(); 1771 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1772 AU.addRequired<InjectTLIMappingsLegacy>(); 1773 1774 // We currently do not preserve loopinfo/dominator analyses with outer loop 1775 // vectorization. Until this is addressed, mark these analyses as preserved 1776 // only for non-VPlan-native path. 1777 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1778 if (!EnableVPlanNativePath) { 1779 AU.addPreserved<LoopInfoWrapperPass>(); 1780 AU.addPreserved<DominatorTreeWrapperPass>(); 1781 } 1782 1783 AU.addPreserved<BasicAAWrapperPass>(); 1784 AU.addPreserved<GlobalsAAWrapperPass>(); 1785 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1786 } 1787 }; 1788 1789 } // end anonymous namespace 1790 1791 //===----------------------------------------------------------------------===// 1792 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1793 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1794 //===----------------------------------------------------------------------===// 1795 1796 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1797 // We need to place the broadcast of invariant variables outside the loop, 1798 // but only if it's proven safe to do so. Else, broadcast will be inside 1799 // vector loop body. 1800 Instruction *Instr = dyn_cast<Instruction>(V); 1801 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1802 (!Instr || 1803 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1804 // Place the code for broadcasting invariant variables in the new preheader. 1805 IRBuilder<>::InsertPointGuard Guard(Builder); 1806 if (SafeToHoist) 1807 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1808 1809 // Broadcast the scalar into all locations in the vector. 
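  // Illustrative IR only (assuming VF = 4 and an i32 scalar %v); the splat is
  // typically lowered to an insertelement into lane 0 plus a zero-mask
  // shufflevector (value names below are illustrative):
  //   %bcast.ins   = insertelement <4 x i32> undef, i32 %v, i32 0
  //   %bcast.splat = shufflevector <4 x i32> %bcast.ins, <4 x i32> undef,
  //                                <4 x i32> zeroinitializer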
1810 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1811 1812 return Shuf; 1813 } 1814 1815 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1816 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1817 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1818 "Expected either an induction phi-node or a truncate of it!"); 1819 Value *Start = II.getStartValue(); 1820 1821 // Construct the initial value of the vector IV in the vector loop preheader 1822 auto CurrIP = Builder.saveIP(); 1823 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1824 if (isa<TruncInst>(EntryVal)) { 1825 assert(Start->getType()->isIntegerTy() && 1826 "Truncation requires an integer type"); 1827 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1828 Step = Builder.CreateTrunc(Step, TruncType); 1829 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1830 } 1831 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1832 Value *SteppedStart = 1833 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1834 1835 // We create vector phi nodes for both integer and floating-point induction 1836 // variables. Here, we determine the kind of arithmetic we will perform. 1837 Instruction::BinaryOps AddOp; 1838 Instruction::BinaryOps MulOp; 1839 if (Step->getType()->isIntegerTy()) { 1840 AddOp = Instruction::Add; 1841 MulOp = Instruction::Mul; 1842 } else { 1843 AddOp = II.getInductionOpcode(); 1844 MulOp = Instruction::FMul; 1845 } 1846 1847 // Multiply the vectorization factor by the step using integer or 1848 // floating-point arithmetic as appropriate. 1849 Value *ConstVF = 1850 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 1851 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1852 1853 // Create a vector splat to use in the induction update. 1854 // 1855 // FIXME: If the step is non-constant, we create the vector splat with 1856 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1857 // handle a constant vector splat. 1858 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1859 Value *SplatVF = isa<Constant>(Mul) 1860 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 1861 : Builder.CreateVectorSplat(VF, Mul); 1862 Builder.restoreIP(CurrIP); 1863 1864 // We may need to add the step a number of times, depending on the unroll 1865 // factor. The last of those goes into the PHI. 1866 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1867 &*LoopVectorBody->getFirstInsertionPt()); 1868 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1869 Instruction *LastInduction = VecInd; 1870 for (unsigned Part = 0; Part < UF; ++Part) { 1871 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1872 1873 if (isa<TruncInst>(EntryVal)) 1874 addMetadata(LastInduction, EntryVal); 1875 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1876 1877 LastInduction = cast<Instruction>(addFastMathFlag( 1878 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1879 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1880 } 1881 1882 // Move the last step to the end of the latch block. This ensures consistent 1883 // placement of all induction updates. 
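  // Illustrative IR (assuming UF = 2, VF = 4, integer step S): the unroll loop
  // above emits
  //   %step.add  = add <4 x i32> %vec.ind,  %splat.of.4xS   ; part 1 value
  //   %step.add1 = add <4 x i32> %step.add, %splat.of.4xS
  // and only the final update (here %step.add1) is moved into the latch below,
  // renamed vec.ind.next and fed back into the %vec.ind phi. Value names are
  // illustrative.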
1884 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1885 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1886 auto *ICmp = cast<Instruction>(Br->getCondition()); 1887 LastInduction->moveBefore(ICmp); 1888 LastInduction->setName("vec.ind.next"); 1889 1890 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1891 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1892 } 1893 1894 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1895 return Cost->isScalarAfterVectorization(I, VF) || 1896 Cost->isProfitableToScalarize(I, VF); 1897 } 1898 1899 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1900 if (shouldScalarizeInstruction(IV)) 1901 return true; 1902 auto isScalarInst = [&](User *U) -> bool { 1903 auto *I = cast<Instruction>(U); 1904 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1905 }; 1906 return llvm::any_of(IV->users(), isScalarInst); 1907 } 1908 1909 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1910 const InductionDescriptor &ID, const Instruction *EntryVal, 1911 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1912 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1913 "Expected either an induction phi-node or a truncate of it!"); 1914 1915 // This induction variable is not the phi from the original loop but the 1916 // newly-created IV based on the proof that casted Phi is equal to the 1917 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1918 // re-uses the same InductionDescriptor that original IV uses but we don't 1919 // have to do any recording in this case - that is done when original IV is 1920 // processed. 1921 if (isa<TruncInst>(EntryVal)) 1922 return; 1923 1924 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1925 if (Casts.empty()) 1926 return; 1927 // Only the first Cast instruction in the Casts vector is of interest. 1928 // The rest of the Casts (if exist) have no uses outside the 1929 // induction update chain itself. 1930 Instruction *CastInst = *Casts.begin(); 1931 if (Lane < UINT_MAX) 1932 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1933 else 1934 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1935 } 1936 1937 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1938 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1939 "Primary induction variable must have an integer type"); 1940 1941 auto II = Legal->getInductionVars().find(IV); 1942 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1943 1944 auto ID = II->second; 1945 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1946 1947 // The value from the original loop to which we are mapping the new induction 1948 // variable. 1949 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1950 1951 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1952 1953 // Generate code for the induction step. 
Note that induction steps are 1954 // required to be loop-invariant 1955 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1956 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1957 "Induction step should be loop invariant"); 1958 if (PSE.getSE()->isSCEVable(IV->getType())) { 1959 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1960 return Exp.expandCodeFor(Step, Step->getType(), 1961 LoopVectorPreHeader->getTerminator()); 1962 } 1963 return cast<SCEVUnknown>(Step)->getValue(); 1964 }; 1965 1966 // The scalar value to broadcast. This is derived from the canonical 1967 // induction variable. If a truncation type is given, truncate the canonical 1968 // induction variable and step. Otherwise, derive these values from the 1969 // induction descriptor. 1970 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1971 Value *ScalarIV = Induction; 1972 if (IV != OldInduction) { 1973 ScalarIV = IV->getType()->isIntegerTy() 1974 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1975 : Builder.CreateCast(Instruction::SIToFP, Induction, 1976 IV->getType()); 1977 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1978 ScalarIV->setName("offset.idx"); 1979 } 1980 if (Trunc) { 1981 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1982 assert(Step->getType()->isIntegerTy() && 1983 "Truncation requires an integer step"); 1984 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1985 Step = Builder.CreateTrunc(Step, TruncType); 1986 } 1987 return ScalarIV; 1988 }; 1989 1990 // Create the vector values from the scalar IV, in the absence of creating a 1991 // vector IV. 1992 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1993 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1994 for (unsigned Part = 0; Part < UF; ++Part) { 1995 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1996 Value *EntryPart = 1997 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 1998 ID.getInductionOpcode()); 1999 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 2000 if (Trunc) 2001 addMetadata(EntryPart, Trunc); 2002 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 2003 } 2004 }; 2005 2006 // Now do the actual transformations, and start with creating the step value. 2007 Value *Step = CreateStepValue(ID.getStep()); 2008 if (VF.isZero() || VF.isScalar()) { 2009 Value *ScalarIV = CreateScalarIV(Step); 2010 CreateSplatIV(ScalarIV, Step); 2011 return; 2012 } 2013 2014 // Determine if we want a scalar version of the induction variable. This is 2015 // true if the induction variable itself is not widened, or if it has at 2016 // least one user in the loop that is not widened. 2017 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2018 if (!NeedsScalarIV) { 2019 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2020 return; 2021 } 2022 2023 // Try to create a new independent vector induction variable. If we can't 2024 // create the phi node, we will splat the scalar induction variable in each 2025 // loop iteration. 2026 if (!shouldScalarizeInstruction(EntryVal)) { 2027 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2028 Value *ScalarIV = CreateScalarIV(Step); 2029 // Create scalar steps that can be used by instructions we will later 2030 // scalarize. Note that the addition of the scalar steps will not increase 2031 // the number of instructions in the loop in the common case prior to 2032 // InstCombine. We will be trading one vector extract for each scalar step. 
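    // For example (illustrative, VF = 4): a user of the IV that stays scalar
    // reads lane L of unroll part P as ScalarIV + (4 * P + L) * Step, which
    // buildScalarSteps materializes directly instead of extracting lane L from
    // the widened induction vector.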
2033 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2034 return; 2035 } 2036 2037 // All IV users are scalar instructions, so only emit a scalar IV, not a 2038 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2039 // predicate used by the masked loads/stores. 2040 Value *ScalarIV = CreateScalarIV(Step); 2041 if (!Cost->isScalarEpilogueAllowed()) 2042 CreateSplatIV(ScalarIV, Step); 2043 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2044 } 2045 2046 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2047 Instruction::BinaryOps BinOp) { 2048 // Create and check the types. 2049 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2050 int VLen = ValVTy->getNumElements(); 2051 2052 Type *STy = Val->getType()->getScalarType(); 2053 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2054 "Induction Step must be an integer or FP"); 2055 assert(Step->getType() == STy && "Step has wrong type"); 2056 2057 SmallVector<Constant *, 8> Indices; 2058 2059 if (STy->isIntegerTy()) { 2060 // Create a vector of consecutive numbers from zero to VF. 2061 for (int i = 0; i < VLen; ++i) 2062 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2063 2064 // Add the consecutive indices to the vector value. 2065 Constant *Cv = ConstantVector::get(Indices); 2066 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2067 Step = Builder.CreateVectorSplat(VLen, Step); 2068 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2069 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2070 // which can be found from the original scalar operations. 2071 Step = Builder.CreateMul(Cv, Step); 2072 return Builder.CreateAdd(Val, Step, "induction"); 2073 } 2074 2075 // Floating point induction. 2076 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2077 "Binary Opcode should be specified for FP induction"); 2078 // Create a vector of consecutive numbers from zero to VF. 2079 for (int i = 0; i < VLen; ++i) 2080 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2081 2082 // Add the consecutive indices to the vector value. 2083 Constant *Cv = ConstantVector::get(Indices); 2084 2085 Step = Builder.CreateVectorSplat(VLen, Step); 2086 2087 // Floating point operations had to be 'fast' to enable the induction. 2088 FastMathFlags Flags; 2089 Flags.setFast(); 2090 2091 Value *MulOp = Builder.CreateFMul(Cv, Step); 2092 if (isa<Instruction>(MulOp)) 2093 // Have to check, MulOp may be a constant 2094 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2095 2096 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2097 if (isa<Instruction>(BOp)) 2098 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2099 return BOp; 2100 } 2101 2102 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2103 Instruction *EntryVal, 2104 const InductionDescriptor &ID) { 2105 // We shouldn't have to build scalar steps if we aren't vectorizing. 2106 assert(VF.isVector() && "VF should be greater than one"); 2107 assert(!VF.isScalable() && 2108 "the code below assumes a fixed number of elements at compile time"); 2109 // Get the value type and ensure it and the step have the same integer type. 2110 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2111 assert(ScalarIVTy == Step->getType() && 2112 "Val and Step should have the same type"); 2113 2114 // We build scalar steps for both integer and floating-point induction 2115 // variables. 
Here, we determine the kind of arithmetic we will perform. 2116 Instruction::BinaryOps AddOp; 2117 Instruction::BinaryOps MulOp; 2118 if (ScalarIVTy->isIntegerTy()) { 2119 AddOp = Instruction::Add; 2120 MulOp = Instruction::Mul; 2121 } else { 2122 AddOp = ID.getInductionOpcode(); 2123 MulOp = Instruction::FMul; 2124 } 2125 2126 // Determine the number of scalars we need to generate for each unroll 2127 // iteration. If EntryVal is uniform, we only need to generate the first 2128 // lane. Otherwise, we generate all VF values. 2129 unsigned Lanes = 2130 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2131 ? 1 2132 : VF.getKnownMinValue(); 2133 // Compute the scalar steps and save the results in VectorLoopValueMap. 2134 for (unsigned Part = 0; Part < UF; ++Part) { 2135 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2136 auto *StartIdx = getSignedIntOrFpConstant( 2137 ScalarIVTy, VF.getKnownMinValue() * Part + Lane); 2138 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2139 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2140 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2141 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2142 } 2143 } 2144 } 2145 2146 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2147 assert(V != Induction && "The new induction variable should not be used."); 2148 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2149 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2150 2151 // If we have a stride that is replaced by one, do it here. Defer this for 2152 // the VPlan-native path until we start running Legal checks in that path. 2153 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2154 V = ConstantInt::get(V->getType(), 1); 2155 2156 // If we have a vector mapped to this value, return it. 2157 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2158 return VectorLoopValueMap.getVectorValue(V, Part); 2159 2160 // If the value has not been vectorized, check if it has been scalarized 2161 // instead. If it has been scalarized, and we actually need the value in 2162 // vector form, we will construct the vector values on demand. 2163 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2164 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2165 2166 // If we've scalarized a value, that value should be an instruction. 2167 auto *I = cast<Instruction>(V); 2168 2169 // If we aren't vectorizing, we can just copy the scalar map values over to 2170 // the vector map. 2171 if (VF.isScalar()) { 2172 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2173 return ScalarValue; 2174 } 2175 2176 // Get the last scalar instruction we generated for V and Part. If the value 2177 // is known to be uniform after vectorization, this corresponds to lane zero 2178 // of the Part unroll iteration. Otherwise, the last instruction is the one 2179 // we created for the last vector lane of the Part unroll iteration. 2180 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2181 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2182 ? 0 2183 : VF.getKnownMinValue() - 1; 2184 auto *LastInst = cast<Instruction>( 2185 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2186 2187 // Set the insert point after the last scalarized instruction. This ensures 2188 // the insertelement sequence will directly follow the scalar definitions. 
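    // Illustrative IR for the non-uniform packing emitted further below
    // (assuming VF = 4 and scalar lanes %s0..%s3; value names illustrative):
    //   %p0 = insertelement <4 x i32> undef, i32 %s0, i32 0
    //   %p1 = insertelement <4 x i32> %p0,   i32 %s1, i32 1
    //   %p2 = insertelement <4 x i32> %p1,   i32 %s2, i32 2
    //   %p3 = insertelement <4 x i32> %p2,   i32 %s3, i32 3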
2189 auto OldIP = Builder.saveIP(); 2190 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2191 Builder.SetInsertPoint(&*NewIP); 2192 2193 // However, if we are vectorizing, we need to construct the vector values. 2194 // If the value is known to be uniform after vectorization, we can just 2195 // broadcast the scalar value corresponding to lane zero for each unroll 2196 // iteration. Otherwise, we construct the vector values using insertelement 2197 // instructions. Since the resulting vectors are stored in 2198 // VectorLoopValueMap, we will only generate the insertelements once. 2199 Value *VectorValue = nullptr; 2200 if (Cost->isUniformAfterVectorization(I, VF)) { 2201 VectorValue = getBroadcastInstrs(ScalarValue); 2202 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2203 } else { 2204 // Initialize packing with insertelements to start from undef. 2205 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2206 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2207 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2208 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2209 packScalarIntoVectorValue(V, {Part, Lane}); 2210 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2211 } 2212 Builder.restoreIP(OldIP); 2213 return VectorValue; 2214 } 2215 2216 // If this scalar is unknown, assume that it is a constant or that it is 2217 // loop invariant. Broadcast V and save the value for future uses. 2218 Value *B = getBroadcastInstrs(V); 2219 VectorLoopValueMap.setVectorValue(V, Part, B); 2220 return B; 2221 } 2222 2223 Value * 2224 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2225 const VPIteration &Instance) { 2226 // If the value is not an instruction contained in the loop, it should 2227 // already be scalar. 2228 if (OrigLoop->isLoopInvariant(V)) 2229 return V; 2230 2231 assert(Instance.Lane > 0 2232 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2233 : true && "Uniform values only have lane zero"); 2234 2235 // If the value from the original loop has not been vectorized, it is 2236 // represented by UF x VF scalar values in the new loop. Return the requested 2237 // scalar value. 2238 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2239 return VectorLoopValueMap.getScalarValue(V, Instance); 2240 2241 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2242 // for the given unroll part. If this entry is not a vector type (i.e., the 2243 // vectorization factor is one), there is no need to generate an 2244 // extractelement instruction. 2245 auto *U = getOrCreateVectorValue(V, Instance.Part); 2246 if (!U->getType()->isVectorTy()) { 2247 assert(VF.isScalar() && "Value not scalarized has non-vector type"); 2248 return U; 2249 } 2250 2251 // Otherwise, the value from the original loop has been vectorized and is 2252 // represented by UF vector values. Extract and return the requested scalar 2253 // value from the appropriate vector lane. 
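  // Illustrative IR (assuming VF = 4 and Instance.Lane == 2):
  //   %scalar = extractelement <4 x i32> %vec.part, i32 2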
2254 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2255 } 2256 2257 void InnerLoopVectorizer::packScalarIntoVectorValue( 2258 Value *V, const VPIteration &Instance) { 2259 assert(V != Induction && "The new induction variable should not be used."); 2260 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2261 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2262 2263 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2264 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2265 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2266 Builder.getInt32(Instance.Lane)); 2267 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2268 } 2269 2270 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2271 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2272 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2273 SmallVector<int, 8> ShuffleMask; 2274 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2275 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2276 2277 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); 2278 } 2279 2280 // Return whether we allow using masked interleave-groups (for dealing with 2281 // strided loads/stores that reside in predicated blocks, or for dealing 2282 // with gaps). 2283 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2284 // If an override option has been passed in for interleaved accesses, use it. 2285 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2286 return EnableMaskedInterleavedMemAccesses; 2287 2288 return TTI.enableMaskedInterleavedAccessVectorization(); 2289 } 2290 2291 // Try to vectorize the interleave group that \p Instr belongs to. 2292 // 2293 // E.g. Translate following interleaved load group (factor = 3): 2294 // for (i = 0; i < N; i+=3) { 2295 // R = Pic[i]; // Member of index 0 2296 // G = Pic[i+1]; // Member of index 1 2297 // B = Pic[i+2]; // Member of index 2 2298 // ... // do something to R, G, B 2299 // } 2300 // To: 2301 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2302 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2303 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2304 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2305 // 2306 // Or translate following interleaved store group (factor = 3): 2307 // for (i = 0; i < N; i+=3) { 2308 // ... do something to R, G, B 2309 // Pic[i] = R; // Member of index 0 2310 // Pic[i+1] = G; // Member of index 1 2311 // Pic[i+2] = B; // Member of index 2 2312 // } 2313 // To: 2314 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2315 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2316 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2317 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2318 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2319 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2320 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2321 VPValue *Addr, VPValue *BlockInMask) { 2322 Instruction *Instr = Group->getInsertPos(); 2323 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2324 2325 // Prepare for the vector type of the interleaved load/store. 
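  // For example (illustrative, matching the comment above): a factor-3 group
  // of i32 accesses vectorized at VF = 4 uses the wide type <12 x i32>
  // computed below.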
2326 Type *ScalarTy = getMemInstValueType(Instr); 2327 unsigned InterleaveFactor = Group->getFactor(); 2328 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2329 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2330 2331 // Prepare for the new pointers. 2332 SmallVector<Value *, 2> AddrParts; 2333 unsigned Index = Group->getIndex(Instr); 2334 2335 // TODO: extend the masked interleaved-group support to reversed access. 2336 assert((!BlockInMask || !Group->isReverse()) && 2337 "Reversed masked interleave-group not supported."); 2338 2339 // If the group is reverse, adjust the index to refer to the last vector lane 2340 // instead of the first. We adjust the index from the first vector lane, 2341 // rather than directly getting the pointer for lane VF - 1, because the 2342 // pointer operand of the interleaved access is supposed to be uniform. For 2343 // uniform instructions, we're only required to generate a value for the 2344 // first vector lane in each unroll iteration. 2345 assert(!VF.isScalable() && 2346 "scalable vector reverse operation is not implemented"); 2347 if (Group->isReverse()) 2348 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2349 2350 for (unsigned Part = 0; Part < UF; Part++) { 2351 Value *AddrPart = State.get(Addr, {Part, 0}); 2352 setDebugLocFromInst(Builder, AddrPart); 2353 2354 // Notice current instruction could be any index. Need to adjust the address 2355 // to the member of index 0. 2356 // 2357 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2358 // b = A[i]; // Member of index 0 2359 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2360 // 2361 // E.g. A[i+1] = a; // Member of index 1 2362 // A[i] = b; // Member of index 0 2363 // A[i+2] = c; // Member of index 2 (Current instruction) 2364 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2365 2366 bool InBounds = false; 2367 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2368 InBounds = gep->isInBounds(); 2369 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2370 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2371 2372 // Cast to the vector pointer type. 2373 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2374 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2375 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2376 } 2377 2378 setDebugLocFromInst(Builder, Instr); 2379 Value *UndefVec = UndefValue::get(VecTy); 2380 2381 Value *MaskForGaps = nullptr; 2382 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2383 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2384 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2385 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2386 } 2387 2388 // Vectorize the interleaved load group. 2389 if (isa<LoadInst>(Instr)) { 2390 // For each unroll part, create a wide load for the group. 
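    // When the group is masked, the per-part block mask is first replicated
    // per member before feeding the masked load below; illustrative for
    // VF = 4 and factor = 3:
    //   <m0,m1,m2,m3>  -->  <m0,m0,m0,m1,m1,m1,m2,m2,m2,m3,m3,m3>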
2391 SmallVector<Value *, 2> NewLoads; 2392 for (unsigned Part = 0; Part < UF; Part++) { 2393 Instruction *NewLoad; 2394 if (BlockInMask || MaskForGaps) { 2395 assert(useMaskedInterleavedAccesses(*TTI) && 2396 "masked interleaved groups are not allowed."); 2397 Value *GroupMask = MaskForGaps; 2398 if (BlockInMask) { 2399 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2400 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2401 Value *ShuffledMask = Builder.CreateShuffleVector( 2402 BlockInMaskPart, 2403 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2404 "interleaved.mask"); 2405 GroupMask = MaskForGaps 2406 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2407 MaskForGaps) 2408 : ShuffledMask; 2409 } 2410 NewLoad = 2411 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2412 GroupMask, UndefVec, "wide.masked.vec"); 2413 } 2414 else 2415 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2416 Group->getAlign(), "wide.vec"); 2417 Group->addMetadata(NewLoad); 2418 NewLoads.push_back(NewLoad); 2419 } 2420 2421 // For each member in the group, shuffle out the appropriate data from the 2422 // wide loads. 2423 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2424 Instruction *Member = Group->getMember(I); 2425 2426 // Skip the gaps in the group. 2427 if (!Member) 2428 continue; 2429 2430 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2431 auto StrideMask = 2432 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2433 for (unsigned Part = 0; Part < UF; Part++) { 2434 Value *StridedVec = Builder.CreateShuffleVector( 2435 NewLoads[Part], StrideMask, "strided.vec"); 2436 2437 // If this member has different type, cast the result type. 2438 if (Member->getType() != ScalarTy) { 2439 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2440 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2441 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2442 } 2443 2444 if (Group->isReverse()) 2445 StridedVec = reverseVector(StridedVec); 2446 2447 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2448 } 2449 } 2450 return; 2451 } 2452 2453 // The sub vector type for current instruction. 2454 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2455 auto *SubVT = VectorType::get(ScalarTy, VF); 2456 2457 // Vectorize the interleaved store group. 2458 for (unsigned Part = 0; Part < UF; Part++) { 2459 // Collect the stored vector from each member. 2460 SmallVector<Value *, 4> StoredVecs; 2461 for (unsigned i = 0; i < InterleaveFactor; i++) { 2462 // Interleaved store group doesn't allow a gap, so each index has a member 2463 Instruction *Member = Group->getMember(i); 2464 assert(Member && "Fail to get a member from an interleaved store group"); 2465 2466 Value *StoredVec = getOrCreateVectorValue( 2467 cast<StoreInst>(Member)->getValueOperand(), Part); 2468 if (Group->isReverse()) 2469 StoredVec = reverseVector(StoredVec); 2470 2471 // If this member has different type, cast it to a unified type. 2472 2473 if (StoredVec->getType() != SubVT) 2474 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2475 2476 StoredVecs.push_back(StoredVec); 2477 } 2478 2479 // Concatenate all vectors into a wide vector. 2480 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2481 2482 // Interleave the elements in the wide vector. 
2483 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2484 Value *IVec = Builder.CreateShuffleVector( 2485 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2486 "interleaved.vec"); 2487 2488 Instruction *NewStoreInstr; 2489 if (BlockInMask) { 2490 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2491 Value *ShuffledMask = Builder.CreateShuffleVector( 2492 BlockInMaskPart, 2493 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2494 "interleaved.mask"); 2495 NewStoreInstr = Builder.CreateMaskedStore( 2496 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2497 } 2498 else 2499 NewStoreInstr = 2500 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2501 2502 Group->addMetadata(NewStoreInstr); 2503 } 2504 } 2505 2506 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2507 VPTransformState &State, 2508 VPValue *Addr, 2509 VPValue *StoredValue, 2510 VPValue *BlockInMask) { 2511 // Attempt to issue a wide load. 2512 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2513 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2514 2515 assert((LI || SI) && "Invalid Load/Store instruction"); 2516 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2517 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2518 2519 LoopVectorizationCostModel::InstWidening Decision = 2520 Cost->getWideningDecision(Instr, VF); 2521 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2522 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2523 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2524 "CM decision is not to widen the memory instruction"); 2525 2526 Type *ScalarDataTy = getMemInstValueType(Instr); 2527 2528 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2529 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2530 const Align Alignment = getLoadStoreAlignment(Instr); 2531 2532 // Determine if the pointer operand of the access is either consecutive or 2533 // reverse consecutive. 2534 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2535 bool ConsecutiveStride = 2536 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2537 bool CreateGatherScatter = 2538 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2539 2540 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2541 // gather/scatter. Otherwise Decision should have been to Scalarize. 2542 assert((ConsecutiveStride || CreateGatherScatter) && 2543 "The instruction should be scalarized"); 2544 (void)ConsecutiveStride; 2545 2546 VectorParts BlockInMaskParts(UF); 2547 bool isMaskRequired = BlockInMask; 2548 if (isMaskRequired) 2549 for (unsigned Part = 0; Part < UF; ++Part) 2550 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2551 2552 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2553 // Calculate the pointer for the specific unroll-part. 2554 GetElementPtrInst *PartPtr = nullptr; 2555 2556 bool InBounds = false; 2557 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2558 InBounds = gep->isInBounds(); 2559 2560 if (Reverse) { 2561 // If the address is consecutive but reversed, then the 2562 // wide store needs to start at the last vector element. 
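      // Worked example (illustrative, VF = 4): part 0 is addressed at
      // Ptr + (0 - 3), covering lanes Ptr-3 .. Ptr; part 1 at Ptr + (-4 - 3),
      // covering Ptr-7 .. Ptr-4. The loaded/stored value (and any mask) is
      // then reversed to restore the original element order.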
2563 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2564 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2565 PartPtr->setIsInBounds(InBounds); 2566 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2567 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2568 PartPtr->setIsInBounds(InBounds); 2569 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2570 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2571 } else { 2572 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2573 ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue()))); 2574 PartPtr->setIsInBounds(InBounds); 2575 } 2576 2577 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2578 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2579 }; 2580 2581 // Handle Stores: 2582 if (SI) { 2583 setDebugLocFromInst(Builder, SI); 2584 2585 for (unsigned Part = 0; Part < UF; ++Part) { 2586 Instruction *NewSI = nullptr; 2587 Value *StoredVal = State.get(StoredValue, Part); 2588 if (CreateGatherScatter) { 2589 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2590 Value *VectorGep = State.get(Addr, Part); 2591 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2592 MaskPart); 2593 } else { 2594 if (Reverse) { 2595 // If we store to reverse consecutive memory locations, then we need 2596 // to reverse the order of elements in the stored value. 2597 StoredVal = reverseVector(StoredVal); 2598 // We don't want to update the value in the map as it might be used in 2599 // another expression. So don't call resetVectorValue(StoredVal). 2600 } 2601 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2602 if (isMaskRequired) 2603 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2604 BlockInMaskParts[Part]); 2605 else 2606 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2607 } 2608 addMetadata(NewSI, SI); 2609 } 2610 return; 2611 } 2612 2613 // Handle loads. 2614 assert(LI && "Must have a load instruction"); 2615 setDebugLocFromInst(Builder, LI); 2616 for (unsigned Part = 0; Part < UF; ++Part) { 2617 Value *NewLI; 2618 if (CreateGatherScatter) { 2619 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2620 Value *VectorGep = State.get(Addr, Part); 2621 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2622 nullptr, "wide.masked.gather"); 2623 addMetadata(NewLI, LI); 2624 } else { 2625 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2626 if (isMaskRequired) 2627 NewLI = Builder.CreateMaskedLoad( 2628 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2629 "wide.masked.load"); 2630 else 2631 NewLI = 2632 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2633 2634 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2635 addMetadata(NewLI, LI); 2636 if (Reverse) 2637 NewLI = reverseVector(NewLI); 2638 } 2639 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2640 } 2641 } 2642 2643 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2644 const VPIteration &Instance, 2645 bool IfPredicateInstr, 2646 VPTransformState &State) { 2647 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2648 2649 setDebugLocFromInst(Builder, Instr); 2650 2651 // Does this instruction return a value ? 
2652 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2653 2654 Instruction *Cloned = Instr->clone(); 2655 if (!IsVoidRetTy) 2656 Cloned->setName(Instr->getName() + ".cloned"); 2657 2658 // Replace the operands of the cloned instructions with their scalar 2659 // equivalents in the new loop. 2660 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2661 auto *NewOp = State.get(User.getOperand(op), Instance); 2662 Cloned->setOperand(op, NewOp); 2663 } 2664 addNewMetadata(Cloned, Instr); 2665 2666 // Place the cloned scalar in the new loop. 2667 Builder.Insert(Cloned); 2668 2669 // Add the cloned scalar to the scalar map entry. 2670 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2671 2672 // If we just cloned a new assumption, add it the assumption cache. 2673 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2674 if (II->getIntrinsicID() == Intrinsic::assume) 2675 AC->registerAssumption(II); 2676 2677 // End if-block. 2678 if (IfPredicateInstr) 2679 PredicatedInstructions.push_back(Cloned); 2680 } 2681 2682 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2683 Value *End, Value *Step, 2684 Instruction *DL) { 2685 BasicBlock *Header = L->getHeader(); 2686 BasicBlock *Latch = L->getLoopLatch(); 2687 // As we're just creating this loop, it's possible no latch exists 2688 // yet. If so, use the header as this will be a single block loop. 2689 if (!Latch) 2690 Latch = Header; 2691 2692 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2693 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2694 setDebugLocFromInst(Builder, OldInst); 2695 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2696 2697 Builder.SetInsertPoint(Latch->getTerminator()); 2698 setDebugLocFromInst(Builder, OldInst); 2699 2700 // Create i+1 and fill the PHINode. 2701 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2702 Induction->addIncoming(Start, L->getLoopPreheader()); 2703 Induction->addIncoming(Next, Latch); 2704 // Create the compare. 2705 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2706 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2707 2708 // Now we have two terminators. Remove the old one from the block. 2709 Latch->getTerminator()->eraseFromParent(); 2710 2711 return Induction; 2712 } 2713 2714 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2715 if (TripCount) 2716 return TripCount; 2717 2718 assert(L && "Create Trip Count for null loop."); 2719 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2720 // Find the loop boundaries. 2721 ScalarEvolution *SE = PSE.getSE(); 2722 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2723 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2724 "Invalid loop count"); 2725 2726 Type *IdxTy = Legal->getWidestInductionType(); 2727 assert(IdxTy && "No type for induction"); 2728 2729 // The exit count might have the type of i64 while the phi is i32. This can 2730 // happen if we have an induction variable that is sign extended before the 2731 // compare. The only way that we get a backedge taken count is that the 2732 // induction variable was signed and as such will not overflow. In such a case 2733 // truncation is legal. 
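  // For example (illustrative): an i32 IV that is sign-extended to i64 for the
  // exit compare yields an i64 backedge-taken count, which is truncated back
  // to the widest induction type below.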
2734 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2735 IdxTy->getPrimitiveSizeInBits()) 2736 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2737 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2738 2739 // Get the total trip count from the count by adding 1. 2740 const SCEV *ExitCount = SE->getAddExpr( 2741 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2742 2743 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2744 2745 // Expand the trip count and place the new instructions in the preheader. 2746 // Notice that the pre-header does not change, only the loop body. 2747 SCEVExpander Exp(*SE, DL, "induction"); 2748 2749 // Count holds the overall loop count (N). 2750 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2751 L->getLoopPreheader()->getTerminator()); 2752 2753 if (TripCount->getType()->isPointerTy()) 2754 TripCount = 2755 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2756 L->getLoopPreheader()->getTerminator()); 2757 2758 return TripCount; 2759 } 2760 2761 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2762 if (VectorTripCount) 2763 return VectorTripCount; 2764 2765 Value *TC = getOrCreateTripCount(L); 2766 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2767 2768 Type *Ty = TC->getType(); 2769 // This is where we can make the step a runtime constant. 2770 assert(!VF.isScalable() && "scalable vectorization is not supported yet"); 2771 Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF); 2772 2773 // If the tail is to be folded by masking, round the number of iterations N 2774 // up to a multiple of Step instead of rounding down. This is done by first 2775 // adding Step-1 and then rounding down. Note that it's ok if this addition 2776 // overflows: the vector induction variable will eventually wrap to zero given 2777 // that it starts at zero and its Step is a power of two; the loop will then 2778 // exit, with the last early-exit vector comparison also producing all-true. 2779 if (Cost->foldTailByMasking()) { 2780 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2781 "VF*UF must be a power of 2 when folding tail by masking"); 2782 TC = Builder.CreateAdd( 2783 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 2784 } 2785 2786 // Now we need to generate the expression for the part of the loop that the 2787 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2788 // iterations are not required for correctness, or N - Step, otherwise. Step 2789 // is equal to the vectorization factor (number of SIMD elements) times the 2790 // unroll factor (number of SIMD instructions). 2791 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2792 2793 // If there is a non-reversed interleaved group that may speculatively access 2794 // memory out-of-bounds, we need to ensure that there will be at least one 2795 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2796 // the trip count, we set the remainder to be equal to the step. If the step 2797 // does not evenly divide the trip count, no adjustment is necessary since 2798 // there will already be scalar iterations. Note that the minimum iterations 2799 // check ensures that N >= Step. 
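  // Worked example with made-up numbers: for VF = 4 and UF = 2 the step is 8.
  // With N = 20, n.mod.vf = 20 % 8 = 4 and n.vec = 16, so 4 iterations remain
  // for the scalar loop. With N = 24 the remainder is 0; if a scalar epilogue
  // is required, the select below bumps the remainder to the full step, giving
  // n.vec = 16 and leaving 8 scalar iterations instead of none.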
2800 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 2801 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2802 R = Builder.CreateSelect(IsZero, Step, R); 2803 } 2804 2805 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2806 2807 return VectorTripCount; 2808 } 2809 2810 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2811 const DataLayout &DL) { 2812 // Verify that V is a vector type with same number of elements as DstVTy. 2813 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2814 unsigned VF = DstFVTy->getNumElements(); 2815 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2816 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2817 Type *SrcElemTy = SrcVecTy->getElementType(); 2818 Type *DstElemTy = DstFVTy->getElementType(); 2819 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2820 "Vector elements must have same size"); 2821 2822 // Do a direct cast if element types are castable. 2823 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2824 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2825 } 2826 // V cannot be directly casted to desired vector type. 2827 // May happen when V is a floating point vector but DstVTy is a vector of 2828 // pointers or vice-versa. Handle this using a two-step bitcast using an 2829 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2830 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2831 "Only one type should be a pointer type"); 2832 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2833 "Only one type should be a floating point type"); 2834 Type *IntTy = 2835 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2836 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2837 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2838 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2839 } 2840 2841 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2842 BasicBlock *Bypass) { 2843 Value *Count = getOrCreateTripCount(L); 2844 // Reuse existing vector loop preheader for TC checks. 2845 // Note that new preheader block is generated for vector loop. 2846 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2847 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2848 2849 // Generate code to check if the loop's trip count is less than VF * UF, or 2850 // equal to it in case a scalar epilogue is required; this implies that the 2851 // vector trip count is zero. This check also covers the case where adding one 2852 // to the backedge-taken count overflowed leading to an incorrect trip count 2853 // of zero. In this case we will also jump to the scalar loop. 2854 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2855 : ICmpInst::ICMP_ULT; 2856 2857 // If tail is to be folded, vector loop takes care of all iterations. 2858 Value *CheckMinIters = Builder.getFalse(); 2859 if (!Cost->foldTailByMasking()) { 2860 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2861 CheckMinIters = Builder.CreateICmp( 2862 P, Count, 2863 ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF), 2864 "min.iters.check"); 2865 } 2866 // Create new preheader for vector loop. 
2867 LoopVectorPreHeader = 2868 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2869 "vector.ph"); 2870 2871 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2872 DT->getNode(Bypass)->getIDom()) && 2873 "TC check is expected to dominate Bypass"); 2874 2875 // Update dominator for Bypass & LoopExit. 2876 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2877 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2878 2879 ReplaceInstWithInst( 2880 TCCheckBlock->getTerminator(), 2881 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2882 LoopBypassBlocks.push_back(TCCheckBlock); 2883 } 2884 2885 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2886 // Reuse existing vector loop preheader for SCEV checks. 2887 // Note that new preheader block is generated for vector loop. 2888 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2889 2890 // Generate the code to check that the SCEV assumptions that we made. 2891 // We want the new basic block to start at the first instruction in a 2892 // sequence of instructions that form a check. 2893 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2894 "scev.check"); 2895 Value *SCEVCheck = Exp.expandCodeForPredicate( 2896 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2897 2898 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2899 if (C->isZero()) 2900 return; 2901 2902 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2903 (OptForSizeBasedOnProfile && 2904 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2905 "Cannot SCEV check stride or overflow when optimizing for size"); 2906 2907 SCEVCheckBlock->setName("vector.scevcheck"); 2908 // Create new preheader for vector loop. 2909 LoopVectorPreHeader = 2910 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2911 nullptr, "vector.ph"); 2912 2913 // Update dominator only if this is first RT check. 2914 if (LoopBypassBlocks.empty()) { 2915 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2916 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2917 } 2918 2919 ReplaceInstWithInst( 2920 SCEVCheckBlock->getTerminator(), 2921 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2922 LoopBypassBlocks.push_back(SCEVCheckBlock); 2923 AddedSafetyChecks = true; 2924 } 2925 2926 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2927 // VPlan-native path does not do any analysis for runtime checks currently. 2928 if (EnableVPlanNativePath) 2929 return; 2930 2931 // Reuse existing vector loop preheader for runtime memory checks. 2932 // Note that new preheader block is generated for vector loop. 2933 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2934 2935 // Generate the code that checks in runtime if arrays overlap. We put the 2936 // checks into a separate block to make the more common case of few elements 2937 // faster. 
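  // Illustrative example (a sketch, not a test case): a loop that needs such
  // checks because its pointers may alias,
  //
  //   void saxpy(float *x, float *y, int n) {
  //     for (int i = 0; i < n; ++i)
  //       y[i] += 2.0f * x[i];
  //   }
  //
  // The vectorizer must verify at runtime that [x, x+n) and [y, y+n) do not
  // overlap and fall back to the scalar loop when they do; declaring the
  // pointers 'restrict' removes the need for the check entirely.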
2938 auto *LAI = Legal->getLAI(); 2939 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 2940 if (!RtPtrChecking.Need) 2941 return; 2942 2943 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2944 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2945 "Cannot emit memory checks when optimizing for size, unless forced " 2946 "to vectorize."); 2947 ORE->emit([&]() { 2948 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2949 L->getStartLoc(), L->getHeader()) 2950 << "Code-size may be reduced by not forcing " 2951 "vectorization, or by source-code modifications " 2952 "eliminating the need for runtime checks " 2953 "(e.g., adding 'restrict')."; 2954 }); 2955 } 2956 2957 MemCheckBlock->setName("vector.memcheck"); 2958 // Create new preheader for vector loop. 2959 LoopVectorPreHeader = 2960 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2961 "vector.ph"); 2962 2963 auto *CondBranch = cast<BranchInst>( 2964 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 2965 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 2966 LoopBypassBlocks.push_back(MemCheckBlock); 2967 AddedSafetyChecks = true; 2968 2969 // Update dominator only if this is first RT check. 2970 if (LoopBypassBlocks.empty()) { 2971 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2972 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2973 } 2974 2975 Instruction *FirstCheckInst; 2976 Instruction *MemRuntimeCheck; 2977 std::tie(FirstCheckInst, MemRuntimeCheck) = 2978 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 2979 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 2980 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 2981 "claimed checks are required"); 2982 CondBranch->setCondition(MemRuntimeCheck); 2983 2984 // We currently don't use LoopVersioning for the actual loop cloning but we 2985 // still use it to add the noalias metadata. 2986 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2987 PSE.getSE()); 2988 LVer->prepareNoAliasMetadata(); 2989 } 2990 2991 Value *InnerLoopVectorizer::emitTransformedIndex( 2992 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2993 const InductionDescriptor &ID) const { 2994 2995 SCEVExpander Exp(*SE, DL, "induction"); 2996 auto Step = ID.getStep(); 2997 auto StartValue = ID.getStartValue(); 2998 assert(Index->getType() == Step->getType() && 2999 "Index type does not match StepValue type"); 3000 3001 // Note: the IR at this point is broken. We cannot use SE to create any new 3002 // SCEV and then expand it, hoping that SCEV's simplification will give us 3003 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3004 // lead to various SCEV crashes. So all we can do is to use builder and rely 3005 // on InstCombine for future simplifications. Here we handle some trivial 3006 // cases only. 
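  // For example (illustrative): an integer induction with start value S and
  // step C maps an index i to S + i * C. The helpers below hand-fold the two
  // degenerate forms of that expression instead of asking SCEV to do it:
  //
  //   add(X, 0) ==> X        mul(X, 1) ==> X
  //   add(0, Y) ==> Y        mul(1, Y) ==> Y
  //
  // Anything beyond that is left for InstCombine to clean up later.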
3007 auto CreateAdd = [&B](Value *X, Value *Y) { 3008 assert(X->getType() == Y->getType() && "Types don't match!"); 3009 if (auto *CX = dyn_cast<ConstantInt>(X)) 3010 if (CX->isZero()) 3011 return Y; 3012 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3013 if (CY->isZero()) 3014 return X; 3015 return B.CreateAdd(X, Y); 3016 }; 3017 3018 auto CreateMul = [&B](Value *X, Value *Y) { 3019 assert(X->getType() == Y->getType() && "Types don't match!"); 3020 if (auto *CX = dyn_cast<ConstantInt>(X)) 3021 if (CX->isOne()) 3022 return Y; 3023 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3024 if (CY->isOne()) 3025 return X; 3026 return B.CreateMul(X, Y); 3027 }; 3028 3029 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3030 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3031 // the DomTree is not kept up-to-date for additional blocks generated in the 3032 // vector loop. By using the header as insertion point, we guarantee that the 3033 // expanded instructions dominate all their uses. 3034 auto GetInsertPoint = [this, &B]() { 3035 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3036 if (InsertBB != LoopVectorBody && 3037 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3038 return LoopVectorBody->getTerminator(); 3039 return &*B.GetInsertPoint(); 3040 }; 3041 switch (ID.getKind()) { 3042 case InductionDescriptor::IK_IntInduction: { 3043 assert(Index->getType() == StartValue->getType() && 3044 "Index type does not match StartValue type"); 3045 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3046 return B.CreateSub(StartValue, Index); 3047 auto *Offset = CreateMul( 3048 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3049 return CreateAdd(StartValue, Offset); 3050 } 3051 case InductionDescriptor::IK_PtrInduction: { 3052 assert(isa<SCEVConstant>(Step) && 3053 "Expected constant step for pointer induction"); 3054 return B.CreateGEP( 3055 StartValue->getType()->getPointerElementType(), StartValue, 3056 CreateMul(Index, 3057 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3058 } 3059 case InductionDescriptor::IK_FpInduction: { 3060 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3061 auto InductionBinOp = ID.getInductionBinOp(); 3062 assert(InductionBinOp && 3063 (InductionBinOp->getOpcode() == Instruction::FAdd || 3064 InductionBinOp->getOpcode() == Instruction::FSub) && 3065 "Original bin op should be defined for FP induction"); 3066 3067 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3068 3069 // Floating point operations had to be 'fast' to enable the induction. 3070 FastMathFlags Flags; 3071 Flags.setFast(); 3072 3073 Value *MulExp = B.CreateFMul(StepValue, Index); 3074 if (isa<Instruction>(MulExp)) 3075 // We have to check, the MulExp may be a constant. 
3076 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3077 3078 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3079 "induction"); 3080 if (isa<Instruction>(BOp)) 3081 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3082 3083 return BOp; 3084 } 3085 case InductionDescriptor::IK_NoInduction: 3086 return nullptr; 3087 } 3088 llvm_unreachable("invalid enum"); 3089 } 3090 3091 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3092 LoopScalarBody = OrigLoop->getHeader(); 3093 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3094 LoopExitBlock = OrigLoop->getExitBlock(); 3095 assert(LoopExitBlock && "Must have an exit block"); 3096 assert(LoopVectorPreHeader && "Invalid loop structure"); 3097 3098 LoopMiddleBlock = 3099 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3100 LI, nullptr, Twine(Prefix) + "middle.block"); 3101 LoopScalarPreHeader = 3102 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3103 nullptr, Twine(Prefix) + "scalar.ph"); 3104 // We intentionally don't let SplitBlock to update LoopInfo since 3105 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3106 // LoopVectorBody is explicitly added to the correct place few lines later. 3107 LoopVectorBody = 3108 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3109 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3110 3111 // Update dominator for loop exit. 3112 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3113 3114 // Create and register the new vector loop. 3115 Loop *Lp = LI->AllocateLoop(); 3116 Loop *ParentLoop = OrigLoop->getParentLoop(); 3117 3118 // Insert the new loop into the loop nest and register the new basic blocks 3119 // before calling any utilities such as SCEV that require valid LoopInfo. 3120 if (ParentLoop) { 3121 ParentLoop->addChildLoop(Lp); 3122 } else { 3123 LI->addTopLevelLoop(Lp); 3124 } 3125 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3126 return Lp; 3127 } 3128 3129 void InnerLoopVectorizer::createInductionResumeValues(Loop *L, 3130 Value *VectorTripCount) { 3131 assert(VectorTripCount && L && "Expected valid arguments"); 3132 // We are going to resume the execution of the scalar loop. 3133 // Go over all of the induction variables that we found and fix the 3134 // PHIs that are left in the scalar version of the loop. 3135 // The starting values of PHI nodes depend on the counter of the last 3136 // iteration in the vectorized loop. 3137 // If we come from a bypass edge then we need to start from the original 3138 // start value. 3139 for (auto &InductionEntry : Legal->getInductionVars()) { 3140 PHINode *OrigPhi = InductionEntry.first; 3141 InductionDescriptor II = InductionEntry.second; 3142 3143 // Create phi nodes to merge from the backedge-taken check block. 3144 PHINode *BCResumeVal = 3145 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3146 LoopScalarPreHeader->getTerminator()); 3147 // Copy original phi DL over to the new one. 3148 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3149 Value *&EndValue = IVEndValues[OrigPhi]; 3150 if (OrigPhi == OldInduction) { 3151 // We know what the end value is. 
3152 EndValue = VectorTripCount; 3153 } else { 3154 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3155 Type *StepType = II.getStep()->getType(); 3156 Instruction::CastOps CastOp = 3157 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3158 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3159 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3160 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3161 EndValue->setName("ind.end"); 3162 } 3163 3164 // The new PHI merges the original incoming value, in case of a bypass, 3165 // or the value at the end of the vectorized loop. 3166 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3167 3168 // Fix the scalar body counter (PHI node). 3169 // The old induction's phi node in the scalar body needs the truncated 3170 // value. 3171 for (BasicBlock *BB : LoopBypassBlocks) 3172 BCResumeVal->addIncoming(II.getStartValue(), BB); 3173 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3174 } 3175 } 3176 3177 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3178 MDNode *OrigLoopID) { 3179 assert(L && "Expected valid loop."); 3180 3181 // The trip counts should be cached by now. 3182 Value *Count = getOrCreateTripCount(L); 3183 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3184 3185 // We need the OrigLoop (scalar loop part) latch terminator to help 3186 // produce correct debug info for the middle block BB instructions. 3187 // The legality check stage guarantees that the loop will have a single 3188 // latch. 3189 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3190 "Scalar loop latch terminator isn't a branch"); 3191 BranchInst *ScalarLatchBr = 3192 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3193 3194 // Add a check in the middle block to see if we have completed 3195 // all of the iterations in the first vector loop. 3196 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3197 // If tail is to be folded, we know we don't need to run the remainder. 3198 Value *CmpN = Builder.getTrue(); 3199 if (!Cost->foldTailByMasking()) { 3200 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3201 VectorTripCount, "cmp.n", 3202 LoopMiddleBlock->getTerminator()); 3203 3204 // Here we use the same DebugLoc as the scalar loop latch branch instead 3205 // of the corresponding compare because they may have ended up with 3206 // different line numbers and we want to avoid awkward line stepping while 3207 // debugging. Eg. if the compare has got a line number inside the loop. 3208 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3209 } 3210 3211 BranchInst *BrInst = 3212 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3213 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3214 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3215 3216 // Get ready to start creating new instructions into the vectorized body. 3217 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3218 "Inconsistent vector loop preheader"); 3219 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3220 3221 Optional<MDNode *> VectorizedLoopID = 3222 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3223 LLVMLoopVectorizeFollowupVectorized}); 3224 if (VectorizedLoopID.hasValue()) { 3225 L->setLoopID(VectorizedLoopID.getValue()); 3226 3227 // Do not setAlreadyVectorized if loop attributes have been defined 3228 // explicitly. 
3229 return LoopVectorPreHeader; 3230 } 3231 3232 // Keep all loop hints from the original loop on the vector loop (we'll 3233 // replace the vectorizer-specific hints below). 3234 if (MDNode *LID = OrigLoop->getLoopID()) 3235 L->setLoopID(LID); 3236 3237 LoopVectorizeHints Hints(L, true, *ORE); 3238 Hints.setAlreadyVectorized(); 3239 3240 #ifdef EXPENSIVE_CHECKS 3241 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3242 LI->verify(*DT); 3243 #endif 3244 3245 return LoopVectorPreHeader; 3246 } 3247 3248 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3249 /* 3250 In this function we generate a new loop. The new loop will contain 3251 the vectorized instructions while the old loop will continue to run the 3252 scalar remainder. 3253 3254 [ ] <-- loop iteration number check. 3255 / | 3256 / v 3257 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3258 | / | 3259 | / v 3260 || [ ] <-- vector pre header. 3261 |/ | 3262 | v 3263 | [ ] \ 3264 | [ ]_| <-- vector loop. 3265 | | 3266 | v 3267 | -[ ] <--- middle-block. 3268 | / | 3269 | / v 3270 -|- >[ ] <--- new preheader. 3271 | | 3272 | v 3273 | [ ] \ 3274 | [ ]_| <-- old scalar loop to handle remainder. 3275 \ | 3276 \ v 3277 >[ ] <-- exit block. 3278 ... 3279 */ 3280 3281 // Get the metadata of the original loop before it gets modified. 3282 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3283 3284 // Create an empty vector loop, and prepare basic blocks for the runtime 3285 // checks. 3286 Loop *Lp = createVectorLoopSkeleton(""); 3287 3288 // Now, compare the new count to zero. If it is zero skip the vector loop and 3289 // jump to the scalar loop. This check also covers the case where the 3290 // backedge-taken count is uint##_max: adding one to it will overflow leading 3291 // to an incorrect trip count of zero. In this (rare) case we will also jump 3292 // to the scalar loop. 3293 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3294 3295 // Generate the code to check any assumptions that we've made for SCEV 3296 // expressions. 3297 emitSCEVChecks(Lp, LoopScalarPreHeader); 3298 3299 // Generate the code that checks in runtime if arrays overlap. We put the 3300 // checks into a separate block to make the more common case of few elements 3301 // faster. 3302 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3303 3304 // Some loops have a single integer induction variable, while other loops 3305 // don't. One example is c++ iterators that often have multiple pointer 3306 // induction variables. In the code below we also support a case where we 3307 // don't have a single induction variable. 3308 // 3309 // We try to obtain an induction variable from the original loop as hard 3310 // as possible. However if we don't find one that: 3311 // - is an integer 3312 // - counts from zero, stepping by one 3313 // - is the size of the widest induction variable type 3314 // then we create a new one. 3315 OldInduction = Legal->getPrimaryInduction(); 3316 Type *IdxTy = Legal->getWidestInductionType(); 3317 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3318 // The loop step is equal to the vectorization factor (num of SIMD elements) 3319 // times the unroll factor (num of SIMD instructions). 
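  // For example (illustrative values): with VF = 4 and UF = 2 the canonical
  // induction variable below advances by 8 per vector iteration, in shorthand
  // IR:
  //
  //   %index      = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  //   %index.next = add i64 %index, 8
  //
  // and the loop exits once %index.next reaches the vector trip count, i.e. N
  // rounded down to a multiple of 8 (CountRoundDown below).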
3320 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3321 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 3322 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3323 Induction = 3324 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3325 getDebugLocFromInstOrOperands(OldInduction)); 3326 3327 // Emit phis for the new starting index of the scalar loop. 3328 createInductionResumeValues(Lp, CountRoundDown); 3329 3330 return completeLoopSkeleton(Lp, OrigLoopID); 3331 } 3332 3333 // Fix up external users of the induction variable. At this point, we are 3334 // in LCSSA form, with all external PHIs that use the IV having one input value, 3335 // coming from the remainder loop. We need those PHIs to also have a correct 3336 // value for the IV when arriving directly from the middle block. 3337 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3338 const InductionDescriptor &II, 3339 Value *CountRoundDown, Value *EndValue, 3340 BasicBlock *MiddleBlock) { 3341 // There are two kinds of external IV usages - those that use the value 3342 // computed in the last iteration (the PHI) and those that use the penultimate 3343 // value (the value that feeds into the phi from the loop latch). 3344 // We allow both, but they, obviously, have different values. 3345 3346 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3347 3348 DenseMap<Value *, Value *> MissingVals; 3349 3350 // An external user of the last iteration's value should see the value that 3351 // the remainder loop uses to initialize its own IV. 3352 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3353 for (User *U : PostInc->users()) { 3354 Instruction *UI = cast<Instruction>(U); 3355 if (!OrigLoop->contains(UI)) { 3356 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3357 MissingVals[UI] = EndValue; 3358 } 3359 } 3360 3361 // An external user of the penultimate value need to see EndValue - Step. 3362 // The simplest way to get this is to recompute it from the constituent SCEVs, 3363 // that is Start + (Step * (CRD - 1)). 3364 for (User *U : OrigPhi->users()) { 3365 auto *UI = cast<Instruction>(U); 3366 if (!OrigLoop->contains(UI)) { 3367 const DataLayout &DL = 3368 OrigLoop->getHeader()->getModule()->getDataLayout(); 3369 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3370 3371 IRBuilder<> B(MiddleBlock->getTerminator()); 3372 Value *CountMinusOne = B.CreateSub( 3373 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3374 Value *CMO = 3375 !II.getStep()->getType()->isIntegerTy() 3376 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3377 II.getStep()->getType()) 3378 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3379 CMO->setName("cast.cmo"); 3380 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3381 Escape->setName("ind.escape"); 3382 MissingVals[UI] = Escape; 3383 } 3384 } 3385 3386 for (auto &I : MissingVals) { 3387 PHINode *PHI = cast<PHINode>(I.first); 3388 // One corner case we have to handle is two IVs "chasing" each-other, 3389 // that is %IV2 = phi [...], [ %IV1, %latch ] 3390 // In this case, if IV1 has an external use, we need to avoid adding both 3391 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3392 // don't already have an incoming value for the middle block. 
3393 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3394 PHI->addIncoming(I.second, MiddleBlock); 3395 } 3396 } 3397 3398 namespace { 3399 3400 struct CSEDenseMapInfo { 3401 static bool canHandle(const Instruction *I) { 3402 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3403 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3404 } 3405 3406 static inline Instruction *getEmptyKey() { 3407 return DenseMapInfo<Instruction *>::getEmptyKey(); 3408 } 3409 3410 static inline Instruction *getTombstoneKey() { 3411 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3412 } 3413 3414 static unsigned getHashValue(const Instruction *I) { 3415 assert(canHandle(I) && "Unknown instruction!"); 3416 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3417 I->value_op_end())); 3418 } 3419 3420 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3421 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3422 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3423 return LHS == RHS; 3424 return LHS->isIdenticalTo(RHS); 3425 } 3426 }; 3427 3428 } // end anonymous namespace 3429 3430 ///Perform cse of induction variable instructions. 3431 static void cse(BasicBlock *BB) { 3432 // Perform simple cse. 3433 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3434 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3435 Instruction *In = &*I++; 3436 3437 if (!CSEDenseMapInfo::canHandle(In)) 3438 continue; 3439 3440 // Check if we can replace this instruction with any of the 3441 // visited instructions. 3442 if (Instruction *V = CSEMap.lookup(In)) { 3443 In->replaceAllUsesWith(V); 3444 In->eraseFromParent(); 3445 continue; 3446 } 3447 3448 CSEMap[In] = In; 3449 } 3450 } 3451 3452 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3453 ElementCount VF, 3454 bool &NeedToScalarize) { 3455 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3456 Function *F = CI->getCalledFunction(); 3457 Type *ScalarRetTy = CI->getType(); 3458 SmallVector<Type *, 4> Tys, ScalarTys; 3459 for (auto &ArgOp : CI->arg_operands()) 3460 ScalarTys.push_back(ArgOp->getType()); 3461 3462 // Estimate cost of scalarized vector call. The source operands are assumed 3463 // to be vectors, so we need to extract individual elements from there, 3464 // execute VF scalar calls, and then gather the result into the vector return 3465 // value. 3466 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3467 TTI::TCK_RecipThroughput); 3468 if (VF.isScalar()) 3469 return ScalarCallCost; 3470 3471 // Compute corresponding vector type for return value and arguments. 3472 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3473 for (Type *ScalarTy : ScalarTys) 3474 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3475 3476 // Compute costs of unpacking argument values for the scalar calls and 3477 // packing the return values to a vector. 3478 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3479 3480 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3481 3482 // If we can't emit a vector call for this function, then the currently found 3483 // cost is the cost we need to return. 
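  // Worked example with made-up costs: for VF = 4, a scalar call costing 10
  // and a scalarization overhead of 12 give a scalarized estimate of
  // 4 * 10 + 12 = 52. If a vector variant of the callee exists and its call
  // cost is, say, 20, the code below returns 20 and clears NeedToScalarize;
  // otherwise the 52 computed above stands and the call is scalarized.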
3484 NeedToScalarize = true; 3485 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3486 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3487 3488 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3489 return Cost; 3490 3491 // If the corresponding vector cost is cheaper, return its cost. 3492 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3493 TTI::TCK_RecipThroughput); 3494 if (VectorCallCost < Cost) { 3495 NeedToScalarize = false; 3496 return VectorCallCost; 3497 } 3498 return Cost; 3499 } 3500 3501 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3502 ElementCount VF) { 3503 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3504 assert(ID && "Expected intrinsic call!"); 3505 3506 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3507 return TTI.getIntrinsicInstrCost(CostAttrs, 3508 TargetTransformInfo::TCK_RecipThroughput); 3509 } 3510 3511 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3512 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3513 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3514 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3515 } 3516 3517 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3518 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3519 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3520 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3521 } 3522 3523 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3524 // For every instruction `I` in MinBWs, truncate the operands, create a 3525 // truncated version of `I` and reextend its result. InstCombine runs 3526 // later and will remove any ext/trunc pairs. 3527 SmallPtrSet<Value *, 4> Erased; 3528 for (const auto &KV : Cost->getMinimalBitwidths()) { 3529 // If the value wasn't vectorized, we must maintain the original scalar 3530 // type. The absence of the value from VectorLoopValueMap indicates that it 3531 // wasn't vectorized. 3532 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3533 continue; 3534 for (unsigned Part = 0; Part < UF; ++Part) { 3535 Value *I = getOrCreateVectorValue(KV.first, Part); 3536 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3537 continue; 3538 Type *OriginalTy = I->getType(); 3539 Type *ScalarTruncatedTy = 3540 IntegerType::get(OriginalTy->getContext(), KV.second); 3541 auto *TruncatedTy = FixedVectorType::get( 3542 ScalarTruncatedTy, 3543 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3544 if (TruncatedTy == OriginalTy) 3545 continue; 3546 3547 IRBuilder<> B(cast<Instruction>(I)); 3548 auto ShrinkOperand = [&](Value *V) -> Value * { 3549 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3550 if (ZI->getSrcTy() == TruncatedTy) 3551 return ZI->getOperand(0); 3552 return B.CreateZExtOrTrunc(V, TruncatedTy); 3553 }; 3554 3555 // The actual instruction modification depends on the instruction type, 3556 // unfortunately. 3557 Value *NewI = nullptr; 3558 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3559 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3560 ShrinkOperand(BO->getOperand(1))); 3561 3562 // Any wrapping introduced by shrinking this operation shouldn't be 3563 // considered undefined behavior. So, we can't unconditionally copy 3564 // arithmetic wrapping flags to NewI. 
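        // For instance (illustrative): an 'add nsw i32' that the cost model
        // proved only needs its low 8 bits may legitimately wrap once narrowed
        // to i8, so keeping 'nsw' on the shrunk operation could introduce
        // poison where the original had none. Hence only the non-wrapping IR
        // flags are copied below.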
3565 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3566 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3567 NewI = 3568 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3569 ShrinkOperand(CI->getOperand(1))); 3570 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3571 NewI = B.CreateSelect(SI->getCondition(), 3572 ShrinkOperand(SI->getTrueValue()), 3573 ShrinkOperand(SI->getFalseValue())); 3574 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3575 switch (CI->getOpcode()) { 3576 default: 3577 llvm_unreachable("Unhandled cast!"); 3578 case Instruction::Trunc: 3579 NewI = ShrinkOperand(CI->getOperand(0)); 3580 break; 3581 case Instruction::SExt: 3582 NewI = B.CreateSExtOrTrunc( 3583 CI->getOperand(0), 3584 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3585 break; 3586 case Instruction::ZExt: 3587 NewI = B.CreateZExtOrTrunc( 3588 CI->getOperand(0), 3589 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3590 break; 3591 } 3592 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3593 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3594 ->getNumElements(); 3595 auto *O0 = B.CreateZExtOrTrunc( 3596 SI->getOperand(0), 3597 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3598 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3599 ->getNumElements(); 3600 auto *O1 = B.CreateZExtOrTrunc( 3601 SI->getOperand(1), 3602 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3603 3604 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3605 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3606 // Don't do anything with the operands, just extend the result. 3607 continue; 3608 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3609 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3610 ->getNumElements(); 3611 auto *O0 = B.CreateZExtOrTrunc( 3612 IE->getOperand(0), 3613 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3614 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3615 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3616 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3617 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3618 ->getNumElements(); 3619 auto *O0 = B.CreateZExtOrTrunc( 3620 EE->getOperand(0), 3621 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3622 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3623 } else { 3624 // If we don't know what to do, be conservative and don't do anything. 3625 continue; 3626 } 3627 3628 // Lastly, extend the result. 3629 NewI->takeName(cast<Instruction>(I)); 3630 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3631 I->replaceAllUsesWith(Res); 3632 cast<Instruction>(I)->eraseFromParent(); 3633 Erased.insert(I); 3634 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3635 } 3636 } 3637 3638 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3639 for (const auto &KV : Cost->getMinimalBitwidths()) { 3640 // If the value wasn't vectorized, we must maintain the original scalar 3641 // type. The absence of the value from VectorLoopValueMap indicates that it 3642 // wasn't vectorized. 
3643     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3644       continue;
3645     for (unsigned Part = 0; Part < UF; ++Part) {
3646       Value *I = getOrCreateVectorValue(KV.first, Part);
3647       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3648       if (Inst && Inst->use_empty()) {
3649         Value *NewI = Inst->getOperand(0);
3650         Inst->eraseFromParent();
3651         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3652       }
3653     }
3654   }
3655 }
3656
3657 void InnerLoopVectorizer::fixVectorizedLoop() {
3658   // Insert truncates and extends for any truncated instructions as hints to
3659   // InstCombine.
3660   if (VF.isVector())
3661     truncateToMinimalBitwidths();
3662
3663   // Fix widened non-induction PHIs by setting up the PHI operands.
3664   if (OrigPHIsToFix.size()) {
3665     assert(EnableVPlanNativePath &&
3666            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3667     fixNonInductionPHIs();
3668   }
3669
3670   // At this point every instruction in the original loop is widened to a
3671   // vector form. Now we need to fix the recurrences in the loop. These PHI
3672   // nodes are currently empty because we did not want to introduce cycles.
3673   // This is the second stage of vectorizing recurrences.
3674   fixCrossIterationPHIs();
3675
3676   // Forget the original basic block.
3677   PSE.getSE()->forgetLoop(OrigLoop);
3678
3679   // Fix-up external users of the induction variables.
3680   for (auto &Entry : Legal->getInductionVars())
3681     fixupIVUsers(Entry.first, Entry.second,
3682                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3683                  IVEndValues[Entry.first], LoopMiddleBlock);
3684
3685   fixLCSSAPHIs();
3686   for (Instruction *PI : PredicatedInstructions)
3687     sinkScalarOperands(&*PI);
3688
3689   // Remove redundant induction instructions.
3690   cse(LoopVectorBody);
3691
3692   // Set/update profile weights for the vector and remainder loops, as the original
3693   // loop iterations are now distributed among them. Note that the original loop,
3694   // represented by LoopScalarBody, becomes the remainder loop after vectorization.
3695   //
3696   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3697   // end up with a slightly less accurate result, but that should be OK since
3698   // profile data is not inherently precise anyway. Note also that a possible
3699   // bypass of the vector code caused by legality checks is ignored, optimistically
3700   // assigning all the weight to the vector loop.
3701   assert(!VF.isScalable() &&
3702          "cannot use scalable ElementCount to determine unroll factor");
3703   setProfileInfoAfterUnrolling(
3704       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3705       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3706 }
3707
3708 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3709   // In order to support recurrences we need to be able to vectorize Phi nodes.
3710   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3711   // stage #2: We now need to fix the recurrences by adding incoming edges to
3712   // the currently empty PHI nodes. At this point every instruction in the
3713   // original loop has been widened to vector form, so we can use the widened
3714   // values to construct the incoming edges.
3715   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3716     // Handle first-order recurrences and reductions that need to be fixed.
3717 if (Legal->isFirstOrderRecurrence(&Phi)) 3718 fixFirstOrderRecurrence(&Phi); 3719 else if (Legal->isReductionVariable(&Phi)) 3720 fixReduction(&Phi); 3721 } 3722 } 3723 3724 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3725 // This is the second phase of vectorizing first-order recurrences. An 3726 // overview of the transformation is described below. Suppose we have the 3727 // following loop. 3728 // 3729 // for (int i = 0; i < n; ++i) 3730 // b[i] = a[i] - a[i - 1]; 3731 // 3732 // There is a first-order recurrence on "a". For this loop, the shorthand 3733 // scalar IR looks like: 3734 // 3735 // scalar.ph: 3736 // s_init = a[-1] 3737 // br scalar.body 3738 // 3739 // scalar.body: 3740 // i = phi [0, scalar.ph], [i+1, scalar.body] 3741 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3742 // s2 = a[i] 3743 // b[i] = s2 - s1 3744 // br cond, scalar.body, ... 3745 // 3746 // In this example, s1 is a recurrence because it's value depends on the 3747 // previous iteration. In the first phase of vectorization, we created a 3748 // temporary value for s1. We now complete the vectorization and produce the 3749 // shorthand vector IR shown below (for VF = 4, UF = 1). 3750 // 3751 // vector.ph: 3752 // v_init = vector(..., ..., ..., a[-1]) 3753 // br vector.body 3754 // 3755 // vector.body 3756 // i = phi [0, vector.ph], [i+4, vector.body] 3757 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3758 // v2 = a[i, i+1, i+2, i+3]; 3759 // v3 = vector(v1(3), v2(0, 1, 2)) 3760 // b[i, i+1, i+2, i+3] = v2 - v3 3761 // br cond, vector.body, middle.block 3762 // 3763 // middle.block: 3764 // x = v2(3) 3765 // br scalar.ph 3766 // 3767 // scalar.ph: 3768 // s_init = phi [x, middle.block], [a[-1], otherwise] 3769 // br scalar.body 3770 // 3771 // After execution completes the vector loop, we extract the next value of 3772 // the recurrence (x) to use as the initial value in the scalar loop. 3773 3774 // Get the original loop preheader and single loop latch. 3775 auto *Preheader = OrigLoop->getLoopPreheader(); 3776 auto *Latch = OrigLoop->getLoopLatch(); 3777 3778 // Get the initial and previous values of the scalar recurrence. 3779 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3780 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3781 3782 // Create a vector from the initial value. 3783 auto *VectorInit = ScalarInit; 3784 if (VF.isVector()) { 3785 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3786 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 3787 VectorInit = Builder.CreateInsertElement( 3788 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3789 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 3790 } 3791 3792 // We constructed a temporary phi node in the first phase of vectorization. 3793 // This phi node will eventually be deleted. 3794 Builder.SetInsertPoint( 3795 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3796 3797 // Create a phi node for the new recurrence. The current value will either be 3798 // the initial value inserted into a vector or loop-varying vector value. 3799 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3800 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3801 3802 // Get the vectorized previous value of the last part UF - 1. It appears last 3803 // among all unrolled iterations, due to the order of their construction. 
3804 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3805 3806 // Find and set the insertion point after the previous value if it is an 3807 // instruction. 3808 BasicBlock::iterator InsertPt; 3809 // Note that the previous value may have been constant-folded so it is not 3810 // guaranteed to be an instruction in the vector loop. 3811 // FIXME: Loop invariant values do not form recurrences. We should deal with 3812 // them earlier. 3813 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3814 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3815 else { 3816 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3817 if (isa<PHINode>(PreviousLastPart)) 3818 // If the previous value is a phi node, we should insert after all the phi 3819 // nodes in the block containing the PHI to avoid breaking basic block 3820 // verification. Note that the basic block may be different to 3821 // LoopVectorBody, in case we predicate the loop. 3822 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3823 else 3824 InsertPt = ++PreviousInst->getIterator(); 3825 } 3826 Builder.SetInsertPoint(&*InsertPt); 3827 3828 // We will construct a vector for the recurrence by combining the values for 3829 // the current and previous iterations. This is the required shuffle mask. 3830 assert(!VF.isScalable()); 3831 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 3832 ShuffleMask[0] = VF.getKnownMinValue() - 1; 3833 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 3834 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 3835 3836 // The vector from which to take the initial value for the current iteration 3837 // (actual or unrolled). Initially, this is the vector phi node. 3838 Value *Incoming = VecPhi; 3839 3840 // Shuffle the current and previous vector and update the vector parts. 3841 for (unsigned Part = 0; Part < UF; ++Part) { 3842 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3843 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3844 auto *Shuffle = 3845 VF.isVector() 3846 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 3847 : Incoming; 3848 PhiPart->replaceAllUsesWith(Shuffle); 3849 cast<Instruction>(PhiPart)->eraseFromParent(); 3850 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3851 Incoming = PreviousPart; 3852 } 3853 3854 // Fix the latch value of the new recurrence in the vector loop. 3855 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3856 3857 // Extract the last vector element in the middle block. This will be the 3858 // initial value for the recurrence when jumping to the scalar loop. 3859 auto *ExtractForScalar = Incoming; 3860 if (VF.isVector()) { 3861 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3862 ExtractForScalar = Builder.CreateExtractElement( 3863 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 3864 "vector.recur.extract"); 3865 } 3866 // Extract the second last element in the middle block if the 3867 // Phi is used outside the loop. We need to extract the phi itself 3868 // and not the last element (the phi update in the current iteration). This 3869 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3870 // when the scalar loop is not run at all. 
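  // Continuing the VF = 4, UF = 1 shorthand example from the comment at the
  // top of this function (illustrative only): if the last vector of new
  // recurrence values is v2 = a[i, i+1, i+2, i+3], lane 3 (a[i+3]) seeds the
  // scalar phi for the remainder loop, while lane 2 (a[i+2]) is the value the
  // original phi held during that final iteration and is what any user
  // outside the loop must see when the scalar loop does not run.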
3871   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3872   if (VF.isVector())
3873     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3874         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
3875         "vector.recur.extract.for.phi");
3876   // When the loop is unrolled without vectorizing, initialize
3877   // ExtractForPhiUsedOutsideLoop with the unrolled part just prior to
3878   // `Incoming`. This is analogous to the vectorized case above: extracting the
3879   // second-to-last element when VF > 1.
3880   else if (UF > 1)
3881     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3882
3883   // Fix the initial value of the original recurrence in the scalar loop.
3884   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3885   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3886   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3887     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3888     Start->addIncoming(Incoming, BB);
3889   }
3890
3891   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3892   Phi->setName("scalar.recur");
3893
3894   // Finally, fix users of the recurrence outside the loop. The users will need
3895   // either the last value of the scalar recurrence or the last value of the
3896   // vector recurrence we extracted in the middle block. Since the loop is in
3897   // LCSSA form, we just need to find all the phi nodes for the original scalar
3898   // recurrence in the exit block, and then add an edge for the middle block.
3899   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3900     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3901       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3902     }
3903   }
3904 }
3905
3906 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3907   Constant *Zero = Builder.getInt32(0);
3908
3909   // Get its reduction variable descriptor.
3910   assert(Legal->isReductionVariable(Phi) &&
3911          "Unable to find the reduction variable");
3912   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3913
3914   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3915   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3916   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3917   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3918       RdxDesc.getMinMaxRecurrenceKind();
3919   setDebugLocFromInst(Builder, ReductionStartValue);
3920   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3921
3922   // We need to generate a reduction vector from the incoming scalar.
3923   // To do so, we need to generate the 'identity' vector and override
3924   // one of the elements with the incoming scalar reduction. We need
3925   // to do it in the vector-loop preheader.
3926   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3927
3928   // This is the vector-clone of the value that leaves the loop.
3929   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3930
3931   // Find the reduction identity value: zero for addition, or and xor; one for
3932   // multiplication; -1 (all ones) for and.
3933   Value *Identity;
3934   Value *VectorStart;
3935   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3936       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3937     // MinMax reductions have the start value as their identity.
3938 if (VF.isScalar() || IsInLoopReductionPhi) { 3939 VectorStart = Identity = ReductionStartValue; 3940 } else { 3941 VectorStart = Identity = 3942 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3943 } 3944 } else { 3945 // Handle other reduction kinds: 3946 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3947 RK, MinMaxKind, VecTy->getScalarType()); 3948 if (VF.isScalar() || IsInLoopReductionPhi) { 3949 Identity = Iden; 3950 // This vector is the Identity vector where the first element is the 3951 // incoming scalar reduction. 3952 VectorStart = ReductionStartValue; 3953 } else { 3954 Identity = ConstantVector::getSplat(VF, Iden); 3955 3956 // This vector is the Identity vector where the first element is the 3957 // incoming scalar reduction. 3958 VectorStart = 3959 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3960 } 3961 } 3962 3963 // Wrap flags are in general invalid after vectorization, clear them. 3964 clearReductionWrapFlags(RdxDesc); 3965 3966 // Fix the vector-loop phi. 3967 3968 // Reductions do not have to start at zero. They can start with 3969 // any loop invariant values. 3970 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3971 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3972 3973 for (unsigned Part = 0; Part < UF; ++Part) { 3974 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3975 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3976 // Make sure to add the reduction start value only to the 3977 // first unroll part. 3978 Value *StartVal = (Part == 0) ? VectorStart : Identity; 3979 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3980 cast<PHINode>(VecRdxPhi) 3981 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3982 } 3983 3984 // Before each round, move the insertion point right between 3985 // the PHIs and the values we are going to write. 3986 // This allows us to write both PHINodes and the extractelement 3987 // instructions. 3988 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3989 3990 setDebugLocFromInst(Builder, LoopExitInst); 3991 3992 // If tail is folded by masking, the vector value to leave the loop should be 3993 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3994 // instead of the former. For an inloop reduction the reduction will already 3995 // be predicated, and does not need to be handled here. 3996 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 3997 for (unsigned Part = 0; Part < UF; ++Part) { 3998 Value *VecLoopExitInst = 3999 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4000 Value *Sel = nullptr; 4001 for (User *U : VecLoopExitInst->users()) { 4002 if (isa<SelectInst>(U)) { 4003 assert(!Sel && "Reduction exit feeding two selects"); 4004 Sel = U; 4005 } else 4006 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4007 } 4008 assert(Sel && "Reduction exit feeds no select"); 4009 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4010 4011 // If the target can create a predicated operator for the reduction at no 4012 // extra cost in the loop (for example a predicated vadd), it can be 4013 // cheaper for the select to remain in the loop than be sunk out of it, 4014 // and so use the select value for the phi instead of the old 4015 // LoopExitValue. 
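      // Shorthand illustration (names are made up): with tail folding the
      // vector body contains
      //
      //   %rdx.phi = phi <4 x i32> [ %start, %vector.ph ], [ %add, %vector.body ]
      //   %add     = add <4 x i32> %rdx.phi, %val
      //   %sel     = select <4 x i1> %mask, <4 x i32> %add, <4 x i32> %rdx.phi
      //
      // When the target prefers a predicated reduction select, the phi's
      // backedge operand is rewired from %add to %sel below, keeping the
      // select inside the loop where it can fold into a predicated add.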
4016 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4017 if (PreferPredicatedReductionSelect || 4018 TTI->preferPredicatedReductionSelect( 4019 RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()), 4020 Phi->getType(), TargetTransformInfo::ReductionFlags())) { 4021 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4022 VecRdxPhi->setIncomingValueForBlock( 4023 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4024 } 4025 } 4026 } 4027 4028 // If the vector reduction can be performed in a smaller type, we truncate 4029 // then extend the loop exit value to enable InstCombine to evaluate the 4030 // entire expression in the smaller type. 4031 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4032 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4033 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4034 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4035 Builder.SetInsertPoint( 4036 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4037 VectorParts RdxParts(UF); 4038 for (unsigned Part = 0; Part < UF; ++Part) { 4039 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4040 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4041 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4042 : Builder.CreateZExt(Trunc, VecTy); 4043 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4044 UI != RdxParts[Part]->user_end();) 4045 if (*UI != Trunc) { 4046 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4047 RdxParts[Part] = Extnd; 4048 } else { 4049 ++UI; 4050 } 4051 } 4052 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4053 for (unsigned Part = 0; Part < UF; ++Part) { 4054 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4055 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4056 } 4057 } 4058 4059 // Reduce all of the unrolled parts into a single vector. 4060 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4061 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 4062 4063 // The middle block terminator has already been assigned a DebugLoc here (the 4064 // OrigLoop's single latch terminator). We want the whole middle block to 4065 // appear to execute on this line because: (a) it is all compiler generated, 4066 // (b) these instructions are always executed after evaluating the latch 4067 // conditional branch, and (c) other passes may add new predecessors which 4068 // terminate on this line. This is the easiest way to ensure we don't 4069 // accidentally cause an extra step back into the loop while debugging. 4070 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4071 for (unsigned Part = 1; Part < UF; ++Part) { 4072 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4073 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4074 // Floating point operations had to be 'fast' to enable the reduction. 4075 ReducedPartRdx = addFastMathFlag( 4076 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4077 ReducedPartRdx, "bin.rdx"), 4078 RdxDesc.getFastMathFlags()); 4079 else 4080 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 4081 RdxPart); 4082 } 4083 4084 // Create the reduction after the loop. Note that inloop reductions create the 4085 // target reduction in the loop using a Reduction recipe. 
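  // Shorthand illustration (assumed VF = 4, UF = 2, integer add): the unrolled
  // parts are first combined element-wise in the middle block,
  //
  //   %bin.rdx = add <4 x i32> %part0, %part1
  //
  // and createTargetReduction then reduces %bin.rdx horizontally to a single
  // scalar (via a vector-reduce intrinsic or a shuffle sequence, whichever the
  // target prefers), which becomes ReducedPartRdx.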
4086 if (VF.isVector() && !IsInLoopReductionPhi) { 4087 bool NoNaN = Legal->hasFunNoNaNAttr(); 4088 ReducedPartRdx = 4089 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 4090 // If the reduction can be performed in a smaller type, we need to extend 4091 // the reduction to the wider type before we branch to the original loop. 4092 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4093 ReducedPartRdx = 4094 RdxDesc.isSigned() 4095 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4096 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4097 } 4098 4099 // Create a phi node that merges control-flow from the backedge-taken check 4100 // block and the middle block. 4101 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4102 LoopScalarPreHeader->getTerminator()); 4103 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4104 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4105 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4106 4107 // Now, we need to fix the users of the reduction variable 4108 // inside and outside of the scalar remainder loop. 4109 // We know that the loop is in LCSSA form. We need to update the 4110 // PHI nodes in the exit blocks. 4111 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4112 // All PHINodes need to have a single entry edge, or two if 4113 // we already fixed them. 4114 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4115 4116 // We found a reduction value exit-PHI. Update it with the 4117 // incoming bypass edge. 4118 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4119 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4120 } // end of the LCSSA phi scan. 4121 4122 // Fix the scalar loop reduction variable with the incoming reduction sum 4123 // from the vector body and from the backedge value. 4124 int IncomingEdgeBlockIdx = 4125 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4126 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4127 // Pick the other block. 4128 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4129 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4130 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4131 } 4132 4133 void InnerLoopVectorizer::clearReductionWrapFlags( 4134 RecurrenceDescriptor &RdxDesc) { 4135 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 4136 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 4137 RK != RecurrenceDescriptor::RK_IntegerMult) 4138 return; 4139 4140 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4141 assert(LoopExitInstr && "null loop exit instruction"); 4142 SmallVector<Instruction *, 8> Worklist; 4143 SmallPtrSet<Instruction *, 8> Visited; 4144 Worklist.push_back(LoopExitInstr); 4145 Visited.insert(LoopExitInstr); 4146 4147 while (!Worklist.empty()) { 4148 Instruction *Cur = Worklist.pop_back_val(); 4149 if (isa<OverflowingBinaryOperator>(Cur)) 4150 for (unsigned Part = 0; Part < UF; ++Part) { 4151 Value *V = getOrCreateVectorValue(Cur, Part); 4152 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4153 } 4154 4155 for (User *U : Cur->users()) { 4156 Instruction *UI = cast<Instruction>(U); 4157 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4158 Visited.insert(UI).second) 4159 Worklist.push_back(UI); 4160 } 4161 } 4162 } 4163 4164 void InnerLoopVectorizer::fixLCSSAPHIs() { 4165 assert(!VF.isScalable() && "the code below assumes fixed width vectors"); 4166 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4167 if (LCSSAPhi.getNumIncomingValues() == 1) { 4168 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4169 // Non-instruction incoming values will have only one value. 4170 unsigned LastLane = 0; 4171 if (isa<Instruction>(IncomingValue)) 4172 LastLane = Cost->isUniformAfterVectorization( 4173 cast<Instruction>(IncomingValue), VF) 4174 ? 0 4175 : VF.getKnownMinValue() - 1; 4176 // Can be a loop invariant incoming value or the last scalar value to be 4177 // extracted from the vectorized loop. 4178 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4179 Value *lastIncomingValue = 4180 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4181 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4182 } 4183 } 4184 } 4185 4186 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4187 // The basic block and loop containing the predicated instruction. 4188 auto *PredBB = PredInst->getParent(); 4189 auto *VectorLoop = LI->getLoopFor(PredBB); 4190 4191 // Initialize a worklist with the operands of the predicated instruction. 4192 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4193 4194 // Holds instructions that we need to analyze again. An instruction may be 4195 // reanalyzed if we don't yet know if we can sink it or not. 4196 SmallVector<Instruction *, 8> InstsToReanalyze; 4197 4198 // Returns true if a given use occurs in the predicated block. Phi nodes use 4199 // their operands in their corresponding predecessor blocks. 4200 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4201 auto *I = cast<Instruction>(U.getUser()); 4202 BasicBlock *BB = I->getParent(); 4203 if (auto *Phi = dyn_cast<PHINode>(I)) 4204 BB = Phi->getIncomingBlock( 4205 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4206 return BB == PredBB; 4207 }; 4208 4209 // Iteratively sink the scalarized operands of the predicated instruction 4210 // into the block we created for it. When an instruction is sunk, it's 4211 // operands are then added to the worklist. 
The algorithm ends after one pass 4212 // through the worklist doesn't sink a single instruction. 4213 bool Changed; 4214 do { 4215 // Add the instructions that need to be reanalyzed to the worklist, and 4216 // reset the changed indicator. 4217 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4218 InstsToReanalyze.clear(); 4219 Changed = false; 4220 4221 while (!Worklist.empty()) { 4222 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4223 4224 // We can't sink an instruction if it is a phi node, is already in the 4225 // predicated block, is not in the loop, or may have side effects. 4226 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4227 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4228 continue; 4229 4230 // It's legal to sink the instruction if all its uses occur in the 4231 // predicated block. Otherwise, there's nothing to do yet, and we may 4232 // need to reanalyze the instruction. 4233 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4234 InstsToReanalyze.push_back(I); 4235 continue; 4236 } 4237 4238 // Move the instruction to the beginning of the predicated block, and add 4239 // it's operands to the worklist. 4240 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4241 Worklist.insert(I->op_begin(), I->op_end()); 4242 4243 // The sinking may have enabled other instructions to be sunk, so we will 4244 // need to iterate. 4245 Changed = true; 4246 } 4247 } while (Changed); 4248 } 4249 4250 void InnerLoopVectorizer::fixNonInductionPHIs() { 4251 for (PHINode *OrigPhi : OrigPHIsToFix) { 4252 PHINode *NewPhi = 4253 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4254 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4255 4256 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4257 predecessors(OrigPhi->getParent())); 4258 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4259 predecessors(NewPhi->getParent())); 4260 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4261 "Scalar and Vector BB should have the same number of predecessors"); 4262 4263 // The insertion point in Builder may be invalidated by the time we get 4264 // here. Force the Builder insertion point to something valid so that we do 4265 // not run into issues during insertion point restore in 4266 // getOrCreateVectorValue calls below. 4267 Builder.SetInsertPoint(NewPhi); 4268 4269 // The predecessor order is preserved and we can rely on mapping between 4270 // scalar and vector block predecessors. 4271 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4272 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4273 4274 // When looking up the new scalar/vector values to fix up, use incoming 4275 // values from original phi. 4276 Value *ScIncV = 4277 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4278 4279 // Scalar incoming value may need a broadcast 4280 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4281 NewPhi->addIncoming(NewIncV, NewPredBB); 4282 } 4283 } 4284 } 4285 4286 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, 4287 unsigned UF, ElementCount VF, 4288 bool IsPtrLoopInvariant, 4289 SmallBitVector &IsIndexLoopInvariant, 4290 VPTransformState &State) { 4291 // Construct a vector GEP by widening the operands of the scalar GEP as 4292 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4293 // results in a vector of pointers when at least one operand of the GEP 4294 // is vector-typed. 
Thus, to keep the representation compact, we only use 4295 // vector-typed operands for loop-varying values. 4296 4297 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4298 // If we are vectorizing, but the GEP has only loop-invariant operands, 4299 // the GEP we build (by only using vector-typed operands for 4300 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4301 // produce a vector of pointers, we need to either arbitrarily pick an 4302 // operand to broadcast, or broadcast a clone of the original GEP. 4303 // Here, we broadcast a clone of the original. 4304 // 4305 // TODO: If at some point we decide to scalarize instructions having 4306 // loop-invariant operands, this special case will no longer be 4307 // required. We would add the scalarization decision to 4308 // collectLoopScalars() and teach getVectorValue() to broadcast 4309 // the lane-zero scalar value. 4310 auto *Clone = Builder.Insert(GEP->clone()); 4311 for (unsigned Part = 0; Part < UF; ++Part) { 4312 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4313 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4314 addMetadata(EntryPart, GEP); 4315 } 4316 } else { 4317 // If the GEP has at least one loop-varying operand, we are sure to 4318 // produce a vector of pointers. But if we are only unrolling, we want 4319 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4320 // produce with the code below will be scalar (if VF == 1) or vector 4321 // (otherwise). Note that for the unroll-only case, we still maintain 4322 // values in the vector mapping with initVector, as we do for other 4323 // instructions. 4324 for (unsigned Part = 0; Part < UF; ++Part) { 4325 // The pointer operand of the new GEP. If it's loop-invariant, we 4326 // won't broadcast it. 4327 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4328 : State.get(Operands.getOperand(0), Part); 4329 4330 // Collect all the indices for the new GEP. If any index is 4331 // loop-invariant, we won't broadcast it. 4332 SmallVector<Value *, 4> Indices; 4333 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4334 VPValue *Operand = Operands.getOperand(I); 4335 if (IsIndexLoopInvariant[I - 1]) 4336 Indices.push_back(State.get(Operand, {0, 0})); 4337 else 4338 Indices.push_back(State.get(Operand, Part)); 4339 } 4340 4341 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4342 // but it should be a vector, otherwise. 4343 auto *NewGEP = 4344 GEP->isInBounds() 4345 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4346 Indices) 4347 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4348 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4349 "NewGEP is not a pointer vector"); 4350 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4351 addMetadata(NewGEP, GEP); 4352 } 4353 } 4354 } 4355 4356 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4357 ElementCount VF) { 4358 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4359 PHINode *P = cast<PHINode>(PN); 4360 if (EnableVPlanNativePath) { 4361 // Currently we enter here in the VPlan-native path for non-induction 4362 // PHIs where all control flow is uniform. We simply widen these PHIs. 4363 // Create a vector phi with no operands - the vector phi operands will be 4364 // set at the end of vector code generation. 4365 Type *VecTy = 4366 (VF.isScalar()) ? 
PN->getType() : VectorType::get(PN->getType(), VF); 4367 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4368 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4369 OrigPHIsToFix.push_back(P); 4370 4371 return; 4372 } 4373 4374 assert(PN->getParent() == OrigLoop->getHeader() && 4375 "Non-header phis should have been handled elsewhere"); 4376 4377 // In order to support recurrences we need to be able to vectorize Phi nodes. 4378 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4379 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4380 // this value when we vectorize all of the instructions that use the PHI. 4381 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4382 for (unsigned Part = 0; Part < UF; ++Part) { 4383 // This is phase one of vectorizing PHIs. 4384 bool ScalarPHI = 4385 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4386 Type *VecTy = 4387 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4388 Value *EntryPart = PHINode::Create( 4389 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4390 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4391 } 4392 return; 4393 } 4394 4395 setDebugLocFromInst(Builder, P); 4396 4397 // This PHINode must be an induction variable. 4398 // Make sure that we know about it. 4399 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4400 4401 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4402 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4403 4404 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4405 // which can be found from the original scalar operations. 4406 switch (II.getKind()) { 4407 case InductionDescriptor::IK_NoInduction: 4408 llvm_unreachable("Unknown induction"); 4409 case InductionDescriptor::IK_IntInduction: 4410 case InductionDescriptor::IK_FpInduction: 4411 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4412 case InductionDescriptor::IK_PtrInduction: { 4413 // Handle the pointer induction variable case. 4414 assert(P->getType()->isPointerTy() && "Unexpected type."); 4415 4416 if (Cost->isScalarAfterVectorization(P, VF)) { 4417 // This is the normalized GEP that starts counting at zero. 4418 Value *PtrInd = 4419 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4420 // Determine the number of scalars we need to generate for each unroll 4421 // iteration. If the instruction is uniform, we only need to generate the 4422 // first lane. Otherwise, we generate all VF values. 4423 unsigned Lanes = 4424 Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.getKnownMinValue(); 4425 for (unsigned Part = 0; Part < UF; ++Part) { 4426 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4427 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4428 Lane + Part * VF.getKnownMinValue()); 4429 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4430 Value *SclrGep = 4431 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4432 SclrGep->setName("next.gep"); 4433 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4434 } 4435 } 4436 return; 4437 } 4438 assert(isa<SCEVConstant>(II.getStep()) && 4439 "Induction step not a SCEV constant!"); 4440 Type *PhiType = II.getStep()->getType(); 4441 4442 // Build a pointer phi 4443 Value *ScalarStartValue = II.getStartValue(); 4444 Type *ScStValueType = ScalarStartValue->getType(); 4445 PHINode *NewPointerPhi = 4446 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4447 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4448 4449 // A pointer induction, performed by using a gep 4450 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4451 Instruction *InductionLoc = LoopLatch->getTerminator(); 4452 const SCEV *ScalarStep = II.getStep(); 4453 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4454 Value *ScalarStepValue = 4455 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4456 Value *InductionGEP = GetElementPtrInst::Create( 4457 ScStValueType->getPointerElementType(), NewPointerPhi, 4458 Builder.CreateMul( 4459 ScalarStepValue, 4460 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4461 "ptr.ind", InductionLoc); 4462 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4463 4464 // Create UF many actual address geps that use the pointer 4465 // phi as base and a vectorized version of the step value 4466 // (<step*0, ..., step*N>) as offset. 4467 for (unsigned Part = 0; Part < UF; ++Part) { 4468 SmallVector<Constant *, 8> Indices; 4469 // Create a vector of consecutive numbers from zero to VF. 4470 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4471 Indices.push_back( 4472 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4473 Constant *StartOffset = ConstantVector::get(Indices); 4474 4475 Value *GEP = Builder.CreateGEP( 4476 ScStValueType->getPointerElementType(), NewPointerPhi, 4477 Builder.CreateMul( 4478 StartOffset, 4479 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4480 "vector.gep")); 4481 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4482 } 4483 } 4484 } 4485 } 4486 4487 /// A helper function for checking whether an integer division-related 4488 /// instruction may divide by zero (in which case it must be predicated if 4489 /// executed conditionally in the scalar code). 4490 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4491 /// Non-zero divisors that are non compile-time constants will not be 4492 /// converted into multiplication, so we will still end up scalarizing 4493 /// the division, but can do so w/o predication. 
4494 static bool mayDivideByZero(Instruction &I) { 4495 assert((I.getOpcode() == Instruction::UDiv || 4496 I.getOpcode() == Instruction::SDiv || 4497 I.getOpcode() == Instruction::URem || 4498 I.getOpcode() == Instruction::SRem) && 4499 "Unexpected instruction"); 4500 Value *Divisor = I.getOperand(1); 4501 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4502 return !CInt || CInt->isZero(); 4503 } 4504 4505 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, 4506 VPTransformState &State) { 4507 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4508 switch (I.getOpcode()) { 4509 case Instruction::Call: 4510 case Instruction::Br: 4511 case Instruction::PHI: 4512 case Instruction::GetElementPtr: 4513 case Instruction::Select: 4514 llvm_unreachable("This instruction is handled by a different recipe."); 4515 case Instruction::UDiv: 4516 case Instruction::SDiv: 4517 case Instruction::SRem: 4518 case Instruction::URem: 4519 case Instruction::Add: 4520 case Instruction::FAdd: 4521 case Instruction::Sub: 4522 case Instruction::FSub: 4523 case Instruction::FNeg: 4524 case Instruction::Mul: 4525 case Instruction::FMul: 4526 case Instruction::FDiv: 4527 case Instruction::FRem: 4528 case Instruction::Shl: 4529 case Instruction::LShr: 4530 case Instruction::AShr: 4531 case Instruction::And: 4532 case Instruction::Or: 4533 case Instruction::Xor: { 4534 // Just widen unops and binops. 4535 setDebugLocFromInst(Builder, &I); 4536 4537 for (unsigned Part = 0; Part < UF; ++Part) { 4538 SmallVector<Value *, 2> Ops; 4539 for (VPValue *VPOp : User.operands()) 4540 Ops.push_back(State.get(VPOp, Part)); 4541 4542 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4543 4544 if (auto *VecOp = dyn_cast<Instruction>(V)) 4545 VecOp->copyIRFlags(&I); 4546 4547 // Use this vector value for all users of the original instruction. 4548 VectorLoopValueMap.setVectorValue(&I, Part, V); 4549 addMetadata(V, &I); 4550 } 4551 4552 break; 4553 } 4554 case Instruction::ICmp: 4555 case Instruction::FCmp: { 4556 // Widen compares. Generate vector compares. 4557 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4558 auto *Cmp = cast<CmpInst>(&I); 4559 setDebugLocFromInst(Builder, Cmp); 4560 for (unsigned Part = 0; Part < UF; ++Part) { 4561 Value *A = State.get(User.getOperand(0), Part); 4562 Value *B = State.get(User.getOperand(1), Part); 4563 Value *C = nullptr; 4564 if (FCmp) { 4565 // Propagate fast math flags. 4566 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4567 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4568 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4569 } else { 4570 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4571 } 4572 VectorLoopValueMap.setVectorValue(&I, Part, C); 4573 addMetadata(C, &I); 4574 } 4575 4576 break; 4577 } 4578 4579 case Instruction::ZExt: 4580 case Instruction::SExt: 4581 case Instruction::FPToUI: 4582 case Instruction::FPToSI: 4583 case Instruction::FPExt: 4584 case Instruction::PtrToInt: 4585 case Instruction::IntToPtr: 4586 case Instruction::SIToFP: 4587 case Instruction::UIToFP: 4588 case Instruction::Trunc: 4589 case Instruction::FPTrunc: 4590 case Instruction::BitCast: { 4591 auto *CI = cast<CastInst>(&I); 4592 setDebugLocFromInst(Builder, CI); 4593 4594 /// Vectorize casts. 4595 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4596 Type *DestTy = 4597 (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4598 4599 for (unsigned Part = 0; Part < UF; ++Part) { 4600 Value *A = State.get(User.getOperand(0), Part); 4601 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4602 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4603 addMetadata(Cast, &I); 4604 } 4605 break; 4606 } 4607 default: 4608 // This instruction is not vectorized by simple widening. 4609 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4610 llvm_unreachable("Unhandled instruction!"); 4611 } // end of switch. 4612 } 4613 4614 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4615 VPTransformState &State) { 4616 assert(!isa<DbgInfoIntrinsic>(I) && 4617 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4618 setDebugLocFromInst(Builder, &I); 4619 4620 Module *M = I.getParent()->getParent()->getParent(); 4621 auto *CI = cast<CallInst>(&I); 4622 4623 SmallVector<Type *, 4> Tys; 4624 for (Value *ArgOperand : CI->arg_operands()) 4625 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4626 4627 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4628 4629 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4630 // version of the instruction. 4631 // Is it beneficial to perform intrinsic call compared to lib call? 4632 bool NeedToScalarize = false; 4633 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4634 bool UseVectorIntrinsic = 4635 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4636 assert((UseVectorIntrinsic || !NeedToScalarize) && 4637 "Instruction should be scalarized elsewhere."); 4638 4639 for (unsigned Part = 0; Part < UF; ++Part) { 4640 SmallVector<Value *, 4> Args; 4641 for (auto &I : enumerate(ArgOperands.operands())) { 4642 // Some intrinsics have a scalar argument - don't replace it with a 4643 // vector. 4644 Value *Arg; 4645 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4646 Arg = State.get(I.value(), Part); 4647 else 4648 Arg = State.get(I.value(), {0, 0}); 4649 Args.push_back(Arg); 4650 } 4651 4652 Function *VectorF; 4653 if (UseVectorIntrinsic) { 4654 // Use vector version of the intrinsic. 4655 Type *TysForDecl[] = {CI->getType()}; 4656 if (VF.isVector()) { 4657 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4658 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4659 } 4660 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4661 assert(VectorF && "Can't retrieve vector intrinsic."); 4662 } else { 4663 // Use vector version of the function call. 4664 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4665 #ifndef NDEBUG 4666 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4667 "Can't create vector function."); 4668 #endif 4669 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4670 } 4671 SmallVector<OperandBundleDef, 1> OpBundles; 4672 CI->getOperandBundlesAsDefs(OpBundles); 4673 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4674 4675 if (isa<FPMathOperator>(V)) 4676 V->copyFastMathFlags(CI); 4677 4678 VectorLoopValueMap.setVectorValue(&I, Part, V); 4679 addMetadata(V, &I); 4680 } 4681 } 4682 4683 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4684 VPUser &Operands, 4685 bool InvariantCond, 4686 VPTransformState &State) { 4687 setDebugLocFromInst(Builder, &I); 4688 4689 // The condition can be loop invariant but still defined inside the 4690 // loop. 
This means that we can't just use the original 'cond' value. 4691 // We have to take the 'vectorized' value and pick the first lane. 4692 // Instcombine will make this a no-op. 4693 auto *InvarCond = 4694 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4695 4696 for (unsigned Part = 0; Part < UF; ++Part) { 4697 Value *Cond = 4698 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4699 Value *Op0 = State.get(Operands.getOperand(1), Part); 4700 Value *Op1 = State.get(Operands.getOperand(2), Part); 4701 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4702 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4703 addMetadata(Sel, &I); 4704 } 4705 } 4706 4707 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4708 // We should not collect Scalars more than once per VF. Right now, this 4709 // function is called from collectUniformsAndScalars(), which already does 4710 // this check. Collecting Scalars for VF=1 does not make any sense. 4711 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4712 "This function should not be visited twice for the same VF"); 4713 4714 SmallSetVector<Instruction *, 8> Worklist; 4715 4716 // These sets are used to seed the analysis with pointers used by memory 4717 // accesses that will remain scalar. 4718 SmallSetVector<Instruction *, 8> ScalarPtrs; 4719 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4720 auto *Latch = TheLoop->getLoopLatch(); 4721 4722 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4723 // The pointer operands of loads and stores will be scalar as long as the 4724 // memory access is not a gather or scatter operation. The value operand of a 4725 // store will remain scalar if the store is scalarized. 4726 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4727 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4728 assert(WideningDecision != CM_Unknown && 4729 "Widening decision should be ready at this moment"); 4730 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4731 if (Ptr == Store->getValueOperand()) 4732 return WideningDecision == CM_Scalarize; 4733 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4734 "Ptr is neither a value or pointer operand"); 4735 return WideningDecision != CM_GatherScatter; 4736 }; 4737 4738 // A helper that returns true if the given value is a bitcast or 4739 // getelementptr instruction contained in the loop. 4740 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4741 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4742 isa<GetElementPtrInst>(V)) && 4743 !TheLoop->isLoopInvariant(V); 4744 }; 4745 4746 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4747 if (!isa<PHINode>(Ptr) || 4748 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4749 return false; 4750 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4751 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4752 return false; 4753 return isScalarUse(MemAccess, Ptr); 4754 }; 4755 4756 // A helper that evaluates a memory access's use of a pointer. If the 4757 // pointer is actually the pointer induction of a loop, it is being 4758 // inserted into Worklist. If the use will be a scalar use, and the 4759 // pointer is only used by memory accesses, we place the pointer in 4760 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
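  // For example (illustrative): a loop-varying getelementptr whose only user
  // is a store that has been decided to be scalarized ends up in ScalarPtrs,
  // whereas the same getelementptr would be placed in PossibleNonScalarPtrs
  // if it were also used by a non-memory instruction or fed a gather/scatter.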
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    if (isScalarPtrInduction(MemAccess, Ptr)) {
      Worklist.insert(cast<Instruction>(Ptr));
      Instruction *Update = cast<Instruction>(
          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
      Worklist.insert(Update);
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
                        << "\n");
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
                        << "\n");
      return;
    }
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
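  // For example (illustrative): if a bitcast already in the worklist has a
  // loop-varying getelementptr as its operand, and that getelementptr's only
  // other users are memory accesses using it as a scalar pointer, then the
  // getelementptr is added to the worklist as well.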
4836 unsigned Idx = 0; 4837 while (Idx != Worklist.size()) { 4838 Instruction *Dst = Worklist[Idx++]; 4839 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4840 continue; 4841 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4842 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4843 auto *J = cast<Instruction>(U); 4844 return !TheLoop->contains(J) || Worklist.count(J) || 4845 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4846 isScalarUse(J, Src)); 4847 })) { 4848 Worklist.insert(Src); 4849 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4850 } 4851 } 4852 4853 // An induction variable will remain scalar if all users of the induction 4854 // variable and induction variable update remain scalar. 4855 for (auto &Induction : Legal->getInductionVars()) { 4856 auto *Ind = Induction.first; 4857 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4858 4859 // If tail-folding is applied, the primary induction variable will be used 4860 // to feed a vector compare. 4861 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4862 continue; 4863 4864 // Determine if all users of the induction variable are scalar after 4865 // vectorization. 4866 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4867 auto *I = cast<Instruction>(U); 4868 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4869 }); 4870 if (!ScalarInd) 4871 continue; 4872 4873 // Determine if all users of the induction variable update instruction are 4874 // scalar after vectorization. 4875 auto ScalarIndUpdate = 4876 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4877 auto *I = cast<Instruction>(U); 4878 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4879 }); 4880 if (!ScalarIndUpdate) 4881 continue; 4882 4883 // The induction variable and its update instruction will remain scalar. 4884 Worklist.insert(Ind); 4885 Worklist.insert(IndUpdate); 4886 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4887 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4888 << "\n"); 4889 } 4890 4891 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4892 } 4893 4894 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 4895 ElementCount VF) { 4896 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4897 if (!blockNeedsPredication(I->getParent())) 4898 return false; 4899 switch(I->getOpcode()) { 4900 default: 4901 break; 4902 case Instruction::Load: 4903 case Instruction::Store: { 4904 if (!Legal->isMaskRequired(I)) 4905 return false; 4906 auto *Ptr = getLoadStorePointerOperand(I); 4907 auto *Ty = getMemInstValueType(I); 4908 // We have already decided how to vectorize this instruction, get that 4909 // result. 4910 if (VF.isVector()) { 4911 InstWidening WideningDecision = getWideningDecision(I, VF); 4912 assert(WideningDecision != CM_Unknown && 4913 "Widening decision should be ready at this moment"); 4914 return WideningDecision == CM_Scalarize; 4915 } 4916 const Align Alignment = getLoadStoreAlignment(I); 4917 return isa<LoadInst>(I) ? 
                              !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
                                isLegalMaskedGather(Ty, Alignment))
                            : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
                                isLegalMaskedScatter(Ty, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    return mayDivideByZero(*I);
  }
  return false;
}

bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
    Instruction *I, ElementCount VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getMemInstValueType(I);
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps.
  bool PredicatedAccessRequiresMasking =
      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
  bool AccessWithGapsRequiresMasking =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  auto *Ty = getMemInstValueType(I);
  const Align Alignment = getLoadStoreAlignment(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
                          : TTI.isLegalMaskedStore(Ty, Alignment);
}

bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
    Instruction *I, ElementCount VF) {
  // Get and ensure we have a valid memory instruction.
  LoadInst *LI = dyn_cast<LoadInst>(I);
  StoreInst *SI = dyn_cast<StoreInst>(I);
  assert((LI || SI) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);

  // First of all, in order to be widened the pointer must be consecutive.
  if (!Legal->isConsecutivePtr(Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I))
    return false;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  return true;
}

void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.
5001 5002 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5003 "This function should not be visited twice for the same VF"); 5004 5005 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5006 // not analyze again. Uniforms.count(VF) will return 1. 5007 Uniforms[VF].clear(); 5008 5009 // We now know that the loop is vectorizable! 5010 // Collect instructions inside the loop that will remain uniform after 5011 // vectorization. 5012 5013 // Global values, params and instructions outside of current loop are out of 5014 // scope. 5015 auto isOutOfScope = [&](Value *V) -> bool { 5016 Instruction *I = dyn_cast<Instruction>(V); 5017 return (!I || !TheLoop->contains(I)); 5018 }; 5019 5020 SetVector<Instruction *> Worklist; 5021 BasicBlock *Latch = TheLoop->getLoopLatch(); 5022 5023 // Instructions that are scalar with predication must not be considered 5024 // uniform after vectorization, because that would create an erroneous 5025 // replicating region where only a single instance out of VF should be formed. 5026 // TODO: optimize such seldom cases if found important, see PR40816. 5027 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5028 if (isScalarWithPredication(I, VF)) { 5029 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5030 << *I << "\n"); 5031 return; 5032 } 5033 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5034 Worklist.insert(I); 5035 }; 5036 5037 // Start with the conditional branch. If the branch condition is an 5038 // instruction contained in the loop that is only used by the branch, it is 5039 // uniform. 5040 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5041 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5042 addToWorklistIfAllowed(Cmp); 5043 5044 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 5045 // are pointers that are treated like consecutive pointers during 5046 // vectorization. The pointer operands of interleaved accesses are an 5047 // example. 5048 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 5049 5050 // Holds pointer operands of instructions that are possibly non-uniform. 5051 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 5052 5053 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5054 InstWidening WideningDecision = getWideningDecision(I, VF); 5055 assert(WideningDecision != CM_Unknown && 5056 "Widening decision should be ready at this moment"); 5057 5058 return (WideningDecision == CM_Widen || 5059 WideningDecision == CM_Widen_Reverse || 5060 WideningDecision == CM_Interleave); 5061 }; 5062 // Iterate over the instructions in the loop, and collect all 5063 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 5064 // that a consecutive-like pointer operand will be scalarized, we collect it 5065 // in PossibleNonUniformPtrs instead. We use two sets here because a single 5066 // getelementptr instruction can be used by both vectorized and scalarized 5067 // memory instructions. For example, if a loop loads and stores from the same 5068 // location, but the store is conditional, the store will be scalarized, and 5069 // the getelementptr won't remain uniform. 5070 for (auto *BB : TheLoop->blocks()) 5071 for (auto &I : *BB) { 5072 // If there's no pointer operand, there's nothing to do. 
5073 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 5074 if (!Ptr) 5075 continue; 5076 5077 // True if all users of Ptr are memory accesses that have Ptr as their 5078 // pointer operand. 5079 auto UsersAreMemAccesses = 5080 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 5081 return getLoadStorePointerOperand(U) == Ptr; 5082 }); 5083 5084 // Ensure the memory instruction will not be scalarized or used by 5085 // gather/scatter, making its pointer operand non-uniform. If the pointer 5086 // operand is used by any instruction other than a memory access, we 5087 // conservatively assume the pointer operand may be non-uniform. 5088 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 5089 PossibleNonUniformPtrs.insert(Ptr); 5090 5091 // If the memory instruction will be vectorized and its pointer operand 5092 // is consecutive-like, or interleaving - the pointer operand should 5093 // remain uniform. 5094 else 5095 ConsecutiveLikePtrs.insert(Ptr); 5096 } 5097 5098 // Add to the Worklist all consecutive and consecutive-like pointers that 5099 // aren't also identified as possibly non-uniform. 5100 for (auto *V : ConsecutiveLikePtrs) 5101 if (!PossibleNonUniformPtrs.count(V)) 5102 addToWorklistIfAllowed(V); 5103 5104 // Expand Worklist in topological order: whenever a new instruction 5105 // is added , its users should be already inside Worklist. It ensures 5106 // a uniform instruction will only be used by uniform instructions. 5107 unsigned idx = 0; 5108 while (idx != Worklist.size()) { 5109 Instruction *I = Worklist[idx++]; 5110 5111 for (auto OV : I->operand_values()) { 5112 // isOutOfScope operands cannot be uniform instructions. 5113 if (isOutOfScope(OV)) 5114 continue; 5115 // First order recurrence Phi's should typically be considered 5116 // non-uniform. 5117 auto *OP = dyn_cast<PHINode>(OV); 5118 if (OP && Legal->isFirstOrderRecurrence(OP)) 5119 continue; 5120 // If all the users of the operand are uniform, then add the 5121 // operand into the uniform worklist. 5122 auto *OI = cast<Instruction>(OV); 5123 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5124 auto *J = cast<Instruction>(U); 5125 return Worklist.count(J) || 5126 (OI == getLoadStorePointerOperand(J) && 5127 isUniformDecision(J, VF)); 5128 })) 5129 addToWorklistIfAllowed(OI); 5130 } 5131 } 5132 5133 // Returns true if Ptr is the pointer operand of a memory access instruction 5134 // I, and I is known to not require scalarization. 5135 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5136 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5137 }; 5138 5139 // For an instruction to be added into Worklist above, all its users inside 5140 // the loop should also be in Worklist. However, this condition cannot be 5141 // true for phi nodes that form a cyclic dependence. We must process phi 5142 // nodes separately. An induction variable will remain uniform if all users 5143 // of the induction variable and induction variable update remain uniform. 5144 // The code below handles both pointer and non-pointer induction variables. 5145 for (auto &Induction : Legal->getInductionVars()) { 5146 auto *Ind = Induction.first; 5147 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5148 5149 // Determine if all users of the induction variable are uniform after 5150 // vectorization. 
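    // For example (illustrative): in a loop that only computes A[i] = B[i],
    // the users of i are its update and the getelementptrs feeding widened
    // consecutive accesses; the latter are already in Worklist, so i and
    // i.next are kept uniform and only lane zero is generated for them.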
5151 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5152 auto *I = cast<Instruction>(U); 5153 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5154 isVectorizedMemAccessUse(I, Ind); 5155 }); 5156 if (!UniformInd) 5157 continue; 5158 5159 // Determine if all users of the induction variable update instruction are 5160 // uniform after vectorization. 5161 auto UniformIndUpdate = 5162 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5163 auto *I = cast<Instruction>(U); 5164 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5165 isVectorizedMemAccessUse(I, IndUpdate); 5166 }); 5167 if (!UniformIndUpdate) 5168 continue; 5169 5170 // The induction variable and its update instruction will remain uniform. 5171 addToWorklistIfAllowed(Ind); 5172 addToWorklistIfAllowed(IndUpdate); 5173 } 5174 5175 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5176 } 5177 5178 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5179 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5180 5181 if (Legal->getRuntimePointerChecking()->Need) { 5182 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5183 "runtime pointer checks needed. Enable vectorization of this " 5184 "loop with '#pragma clang loop vectorize(enable)' when " 5185 "compiling with -Os/-Oz", 5186 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5187 return true; 5188 } 5189 5190 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5191 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5192 "runtime SCEV checks needed. Enable vectorization of this " 5193 "loop with '#pragma clang loop vectorize(enable)' when " 5194 "compiling with -Os/-Oz", 5195 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5196 return true; 5197 } 5198 5199 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5200 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5201 reportVectorizationFailure("Runtime stride check for small trip count", 5202 "runtime stride == 1 checks needed. Enable vectorization of " 5203 "this loop without such check by compiling with -Os/-Oz", 5204 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5205 return true; 5206 } 5207 5208 return false; 5209 } 5210 5211 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, 5212 unsigned UserIC) { 5213 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5214 // TODO: It may by useful to do since it's still likely to be dynamically 5215 // uniform if the target can skip. 5216 reportVectorizationFailure( 5217 "Not inserting runtime ptr check for divergent target", 5218 "runtime pointer checks needed. Not enabled for divergent target", 5219 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5220 return None; 5221 } 5222 5223 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5224 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5225 if (TC == 1) { 5226 reportVectorizationFailure("Single iteration (non) loop", 5227 "loop trip count is one, irrelevant for vectorization", 5228 "SingleIterationLoop", ORE, TheLoop); 5229 return None; 5230 } 5231 5232 switch (ScalarEpilogueStatus) { 5233 case CM_ScalarEpilogueAllowed: 5234 return UserVF ? 
                      UserVF : computeFeasibleMaxVF(TC);
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;
    break;
  }

  // Now try the tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
  assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
  unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
  if (TC > 0 && TC % MaxVFtimesIC == 0) {
    // Accept MaxVF if we do not have a tail.
    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
    return MaxVF;
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxVF;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fall back to vectorization with a scalar epilogue.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) {
      LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
      return None;
    }
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                      "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxVF;
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return None;
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return None;
}

unsigned
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();

  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
  unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << WidestRegister << " bits.\n");

  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
                                 " into one vector!");
  if (MaxVectorSize == 0) {
    LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
    return MaxVectorSize;
  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
             isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to be the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below.
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                      << ConstTripCount << "\n");
    MaxVectorSize = ConstTripCount;
    return MaxVectorSize;
  }

  unsigned MaxVF = MaxVectorSize;
  if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorSize).
    SmallVector<ElementCount, 8> VFs;
    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
    for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
      VFs.push_back(ElementCount::getFixed(VS));

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
5373 for (int i = RUs.size() - 1; i >= 0; --i) { 5374 bool Selected = true; 5375 for (auto& pair : RUs[i].MaxLocalUsers) { 5376 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5377 if (pair.second > TargetNumRegisters) 5378 Selected = false; 5379 } 5380 if (Selected) { 5381 MaxVF = VFs[i].getKnownMinValue(); 5382 break; 5383 } 5384 } 5385 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5386 if (MaxVF < MinVF) { 5387 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5388 << ") with target's minimum: " << MinVF << '\n'); 5389 MaxVF = MinVF; 5390 } 5391 } 5392 } 5393 return MaxVF; 5394 } 5395 5396 VectorizationFactor 5397 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5398 float Cost = expectedCost(ElementCount::getFixed(1)).first; 5399 const float ScalarCost = Cost; 5400 unsigned Width = 1; 5401 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5402 5403 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5404 if (ForceVectorization && MaxVF > 1) { 5405 // Ignore scalar width, because the user explicitly wants vectorization. 5406 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5407 // evaluation. 5408 Cost = std::numeric_limits<float>::max(); 5409 } 5410 5411 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5412 // Notice that the vector loop needs to be executed less times, so 5413 // we need to divide the cost of the vector loops by the width of 5414 // the vector elements. 5415 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5416 float VectorCost = C.first / (float)i; 5417 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5418 << " costs: " << (int)VectorCost << ".\n"); 5419 if (!C.second && !ForceVectorization) { 5420 LLVM_DEBUG( 5421 dbgs() << "LV: Not considering vector loop of width " << i 5422 << " because it will not generate any vector instructions.\n"); 5423 continue; 5424 } 5425 if (VectorCost < Cost) { 5426 Cost = VectorCost; 5427 Width = i; 5428 } 5429 } 5430 5431 if (!EnableCondStoresVectorization && NumPredStores) { 5432 reportVectorizationFailure("There are conditional stores.", 5433 "store that is conditionally executed prevents vectorization", 5434 "ConditionalStore", ORE, TheLoop); 5435 Width = 1; 5436 Cost = ScalarCost; 5437 } 5438 5439 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5440 << "LV: Vectorization seems to be not beneficial, " 5441 << "but was forced by a user.\n"); 5442 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5443 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5444 (unsigned)(Width * Cost)}; 5445 return Factor; 5446 } 5447 5448 std::pair<unsigned, unsigned> 5449 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5450 unsigned MinWidth = -1U; 5451 unsigned MaxWidth = 8; 5452 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5453 5454 // For each block. 5455 for (BasicBlock *BB : TheLoop->blocks()) { 5456 // For each instruction in the loop. 5457 for (Instruction &I : BB->instructionsWithoutDebug()) { 5458 Type *T = I.getType(); 5459 5460 // Skip ignored values. 5461 if (ValuesToIgnore.count(&I)) 5462 continue; 5463 5464 // Only examine Loads, Stores and PHINodes. 5465 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5466 continue; 5467 5468 // Examine PHI nodes that are reduction variables. Update the type to 5469 // account for the recurrence type. 
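      // For example (illustrative): a reduction phi declared as i32 whose
      // recurrence has been narrowed to i8 by the reduction analysis
      // contributes 8 bits, not 32, to the smallest/widest computation below.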
5470 if (auto *PN = dyn_cast<PHINode>(&I)) { 5471 if (!Legal->isReductionVariable(PN)) 5472 continue; 5473 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5474 T = RdxDesc.getRecurrenceType(); 5475 } 5476 5477 // Examine the stored values. 5478 if (auto *ST = dyn_cast<StoreInst>(&I)) 5479 T = ST->getValueOperand()->getType(); 5480 5481 // Ignore loaded pointer types and stored pointer types that are not 5482 // vectorizable. 5483 // 5484 // FIXME: The check here attempts to predict whether a load or store will 5485 // be vectorized. We only know this for certain after a VF has 5486 // been selected. Here, we assume that if an access can be 5487 // vectorized, it will be. We should also look at extending this 5488 // optimization to non-pointer types. 5489 // 5490 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5491 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5492 continue; 5493 5494 MinWidth = std::min(MinWidth, 5495 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5496 MaxWidth = std::max(MaxWidth, 5497 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5498 } 5499 } 5500 5501 return {MinWidth, MaxWidth}; 5502 } 5503 5504 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5505 unsigned LoopCost) { 5506 // -- The interleave heuristics -- 5507 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5508 // There are many micro-architectural considerations that we can't predict 5509 // at this level. For example, frontend pressure (on decode or fetch) due to 5510 // code size, or the number and capabilities of the execution ports. 5511 // 5512 // We use the following heuristics to select the interleave count: 5513 // 1. If the code has reductions, then we interleave to break the cross 5514 // iteration dependency. 5515 // 2. If the loop is really small, then we interleave to reduce the loop 5516 // overhead. 5517 // 3. We don't interleave if we think that we will spill registers to memory 5518 // due to the increased register pressure. 5519 5520 if (!isScalarEpilogueAllowed()) 5521 return 1; 5522 5523 // We used the distance for the interleave count. 5524 if (Legal->getMaxSafeDepDistBytes() != -1U) 5525 return 1; 5526 5527 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5528 const bool HasReductions = !Legal->getReductionVars().empty(); 5529 // Do not interleave loops with a relatively small known or estimated trip 5530 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5531 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5532 // because with the above conditions interleaving can expose ILP and break 5533 // cross iteration dependences for reductions. 5534 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5535 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5536 return 1; 5537 5538 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5539 // We divide by these constants so assume that we have at least one 5540 // instruction that uses at least one register. 5541 for (auto& pair : R.MaxLocalUsers) { 5542 pair.second = std::max(pair.second, 1U); 5543 } 5544 5545 // We calculate the interleave count using the following formula. 5546 // Subtract the number of loop invariants from the number of available 5547 // registers. These registers are used by all of the interleaved instances. 
5548 // Next, divide the remaining registers by the number of registers that is 5549 // required by the loop, in order to estimate how many parallel instances 5550 // fit without causing spills. All of this is rounded down if necessary to be 5551 // a power of two. We want power of two interleave count to simplify any 5552 // addressing operations or alignment considerations. 5553 // We also want power of two interleave counts to ensure that the induction 5554 // variable of the vector loop wraps to zero, when tail is folded by masking; 5555 // this currently happens when OptForSize, in which case IC is set to 1 above. 5556 unsigned IC = UINT_MAX; 5557 5558 for (auto& pair : R.MaxLocalUsers) { 5559 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5560 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5561 << " registers of " 5562 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5563 if (VF.isScalar()) { 5564 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5565 TargetNumRegisters = ForceTargetNumScalarRegs; 5566 } else { 5567 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5568 TargetNumRegisters = ForceTargetNumVectorRegs; 5569 } 5570 unsigned MaxLocalUsers = pair.second; 5571 unsigned LoopInvariantRegs = 0; 5572 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5573 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5574 5575 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5576 // Don't count the induction variable as interleaved. 5577 if (EnableIndVarRegisterHeur) { 5578 TmpIC = 5579 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5580 std::max(1U, (MaxLocalUsers - 1))); 5581 } 5582 5583 IC = std::min(IC, TmpIC); 5584 } 5585 5586 // Clamp the interleave ranges to reasonable counts. 5587 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5588 unsigned MaxInterleaveCount = 5589 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5590 5591 // Check if the user has overridden the max. 5592 if (VF.isScalar()) { 5593 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5594 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5595 } else { 5596 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5597 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5598 } 5599 5600 // If trip count is known or estimated compile time constant, limit the 5601 // interleave count to be less than the trip count divided by VF. 5602 if (BestKnownTC) { 5603 MaxInterleaveCount = 5604 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5605 } 5606 5607 // If we did not calculate the cost for VF (because the user selected the VF) 5608 // then we calculate the cost of VF here. 5609 if (LoopCost == 0) 5610 LoopCost = expectedCost(VF).first; 5611 5612 assert(LoopCost && "Non-zero loop cost expected"); 5613 5614 // Clamp the calculated IC to be between the 1 and the max interleave count 5615 // that the target and trip count allows. 5616 if (IC > MaxInterleaveCount) 5617 IC = MaxInterleaveCount; 5618 else if (IC < 1) 5619 IC = 1; 5620 5621 // Interleave if we vectorized this loop and there is a reduction that could 5622 // benefit from interleaving. 
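  // Worked example of the clamping so far (all numbers hypothetical): with 32
  // registers in a class, 2 of them tied up by loop invariants and
  // MaxLocalUsers == 6, the register-pressure bound is
  // PowerOf2Floor((32 - 2) / 6) == 4. If TTI.getMaxInterleaveFactor() returns
  // 2 and the known or estimated trip count divided by VF allows it, IC is
  // further clamped to 2 before the reduction and small-loop checks below
  // decide whether to use it or fall back to an interleave count of 1.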
5623 if (VF.isVector() && HasReductions) { 5624 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5625 return IC; 5626 } 5627 5628 // Note that if we've already vectorized the loop we will have done the 5629 // runtime check and so interleaving won't require further checks. 5630 bool InterleavingRequiresRuntimePointerCheck = 5631 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5632 5633 // We want to interleave small loops in order to reduce the loop overhead and 5634 // potentially expose ILP opportunities. 5635 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5636 << "LV: IC is " << IC << '\n' 5637 << "LV: VF is " << VF.getKnownMinValue() << '\n'); 5638 const bool AggressivelyInterleaveReductions = 5639 TTI.enableAggressiveInterleaving(HasReductions); 5640 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5641 // We assume that the cost overhead is 1 and we use the cost model 5642 // to estimate the cost of the loop and interleave until the cost of the 5643 // loop overhead is about 5% of the cost of the loop. 5644 unsigned SmallIC = 5645 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5646 5647 // Interleave until store/load ports (estimated by max interleave count) are 5648 // saturated. 5649 unsigned NumStores = Legal->getNumStores(); 5650 unsigned NumLoads = Legal->getNumLoads(); 5651 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5652 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5653 5654 // If we have a scalar reduction (vector reductions are already dealt with 5655 // by this point), we can increase the critical path length if the loop 5656 // we're interleaving is inside another loop. Limit, by default to 2, so the 5657 // critical path only gets increased by one reduction operation. 5658 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5659 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5660 SmallIC = std::min(SmallIC, F); 5661 StoresIC = std::min(StoresIC, F); 5662 LoadsIC = std::min(LoadsIC, F); 5663 } 5664 5665 if (EnableLoadStoreRuntimeInterleave && 5666 std::max(StoresIC, LoadsIC) > SmallIC) { 5667 LLVM_DEBUG( 5668 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5669 return std::max(StoresIC, LoadsIC); 5670 } 5671 5672 // If there are scalar reductions and TTI has enabled aggressive 5673 // interleaving for reductions, we will interleave to expose ILP. 5674 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5675 AggressivelyInterleaveReductions) { 5676 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5677 // Interleave no less than SmallIC but not as aggressive as the normal IC 5678 // to satisfy the rare situation when resources are too limited. 5679 return std::max(IC / 2, SmallIC); 5680 } else { 5681 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5682 return SmallIC; 5683 } 5684 } 5685 5686 // Interleave if this is a large loop (small loops are already dealt with by 5687 // this point) that could benefit from interleaving. 
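  // Example of the small-loop path above (hypothetical costs): with
  // LoopCost == 4 and SmallLoopCost == 20, SmallIC becomes
  // std::min(IC, PowerOf2Floor(20 / 4)) == std::min(IC, 4), keeping the
  // per-copy loop overhead small relative to the body. With IC == 8, two
  // stores and one load, StoresIC == 4 and LoadsIC == 8, so the load/store
  // port heuristic (when enabled) would return 8 instead of SmallIC.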
5688 if (AggressivelyInterleaveReductions) { 5689 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5690 return IC; 5691 } 5692 5693 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5694 return 1; 5695 } 5696 5697 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5698 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5699 // This function calculates the register usage by measuring the highest number 5700 // of values that are alive at a single location. Obviously, this is a very 5701 // rough estimation. We scan the loop in a topological order in order and 5702 // assign a number to each instruction. We use RPO to ensure that defs are 5703 // met before their users. We assume that each instruction that has in-loop 5704 // users starts an interval. We record every time that an in-loop value is 5705 // used, so we have a list of the first and last occurrences of each 5706 // instruction. Next, we transpose this data structure into a multi map that 5707 // holds the list of intervals that *end* at a specific location. This multi 5708 // map allows us to perform a linear search. We scan the instructions linearly 5709 // and record each time that a new interval starts, by placing it in a set. 5710 // If we find this value in the multi-map then we remove it from the set. 5711 // The max register usage is the maximum size of the set. 5712 // We also search for instructions that are defined outside the loop, but are 5713 // used inside the loop. We need this number separately from the max-interval 5714 // usage number because when we unroll, loop-invariant values do not take 5715 // more register. 5716 LoopBlocksDFS DFS(TheLoop); 5717 DFS.perform(LI); 5718 5719 RegisterUsage RU; 5720 5721 // Each 'key' in the map opens a new interval. The values 5722 // of the map are the index of the 'last seen' usage of the 5723 // instruction that is the key. 5724 using IntervalMap = DenseMap<Instruction *, unsigned>; 5725 5726 // Maps instruction to its index. 5727 SmallVector<Instruction *, 64> IdxToInstr; 5728 // Marks the end of each interval. 5729 IntervalMap EndPoint; 5730 // Saves the list of instruction indices that are used in the loop. 5731 SmallPtrSet<Instruction *, 8> Ends; 5732 // Saves the list of values that are used in the loop but are 5733 // defined outside the loop, such as arguments and constants. 5734 SmallPtrSet<Value *, 8> LoopInvariants; 5735 5736 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5737 for (Instruction &I : BB->instructionsWithoutDebug()) { 5738 IdxToInstr.push_back(&I); 5739 5740 // Save the end location of each USE. 5741 for (Value *U : I.operands()) { 5742 auto *Instr = dyn_cast<Instruction>(U); 5743 5744 // Ignore non-instruction values such as arguments, constants, etc. 5745 if (!Instr) 5746 continue; 5747 5748 // If this instruction is outside the loop then record it and continue. 5749 if (!TheLoop->contains(Instr)) { 5750 LoopInvariants.insert(Instr); 5751 continue; 5752 } 5753 5754 // Overwrite previous end points. 5755 EndPoint[Instr] = IdxToInstr.size(); 5756 Ends.insert(Instr); 5757 } 5758 } 5759 } 5760 5761 // Saves the list of intervals that end with the index in 'key'. 5762 using InstrList = SmallVector<Instruction *, 2>; 5763 DenseMap<unsigned, InstrList> TransposeEnds; 5764 5765 // Transpose the EndPoints to a list of values that end at each index. 
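  // Tiny illustration of the interval bookkeeping above, for a hypothetical
  // body "%a = load; %b = mul %a; %c = add %b, %a; store %c": the last use of
  // both %a and %b is %c, so their intervals stay open until the store is
  // visited, and the peak number of simultaneously open intervals is 2 (while
  // visiting %c, %a and %b are both live). Pointer operands defined outside
  // the loop only land in LoopInvariants and never enter OpenIntervals.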
5766 for (auto &Interval : EndPoint) 5767 TransposeEnds[Interval.second].push_back(Interval.first); 5768 5769 SmallPtrSet<Instruction *, 8> OpenIntervals; 5770 5771 // Get the size of the widest register. 5772 unsigned MaxSafeDepDist = -1U; 5773 if (Legal->getMaxSafeDepDistBytes() != -1U) 5774 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5775 unsigned WidestRegister = 5776 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5777 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5778 5779 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5780 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5781 5782 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5783 5784 // A lambda that gets the register usage for the given type and VF. 5785 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) { 5786 if (Ty->isTokenTy()) 5787 return 0U; 5788 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5789 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5790 return std::max<unsigned>(1, VF.getKnownMinValue() * TypeSize / 5791 WidestRegister); 5792 }; 5793 5794 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5795 Instruction *I = IdxToInstr[i]; 5796 5797 // Remove all of the instructions that end at this location. 5798 InstrList &List = TransposeEnds[i]; 5799 for (Instruction *ToRemove : List) 5800 OpenIntervals.erase(ToRemove); 5801 5802 // Ignore instructions that are never used within the loop. 5803 if (!Ends.count(I)) 5804 continue; 5805 5806 // Skip ignored values. 5807 if (ValuesToIgnore.count(I)) 5808 continue; 5809 5810 // For each VF find the maximum usage of registers. 5811 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5812 // Count the number of live intervals. 5813 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5814 5815 if (VFs[j].isScalar()) { 5816 for (auto Inst : OpenIntervals) { 5817 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5818 if (RegUsage.find(ClassID) == RegUsage.end()) 5819 RegUsage[ClassID] = 1; 5820 else 5821 RegUsage[ClassID] += 1; 5822 } 5823 } else { 5824 collectUniformsAndScalars(VFs[j]); 5825 for (auto Inst : OpenIntervals) { 5826 // Skip ignored values for VF > 1. 5827 if (VecValuesToIgnore.count(Inst)) 5828 continue; 5829 if (isScalarAfterVectorization(Inst, VFs[j])) { 5830 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5831 if (RegUsage.find(ClassID) == RegUsage.end()) 5832 RegUsage[ClassID] = 1; 5833 else 5834 RegUsage[ClassID] += 1; 5835 } else { 5836 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5837 if (RegUsage.find(ClassID) == RegUsage.end()) 5838 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5839 else 5840 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5841 } 5842 } 5843 } 5844 5845 for (auto& pair : RegUsage) { 5846 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5847 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5848 else 5849 MaxUsages[j][pair.first] = pair.second; 5850 } 5851 } 5852 5853 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5854 << OpenIntervals.size() << '\n'); 5855 5856 // Add the current instruction to the list of open intervals. 
5857 OpenIntervals.insert(I); 5858 } 5859 5860 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5861 SmallMapVector<unsigned, unsigned, 4> Invariant; 5862 5863 for (auto Inst : LoopInvariants) { 5864 unsigned Usage = 5865 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5866 unsigned ClassID = 5867 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 5868 if (Invariant.find(ClassID) == Invariant.end()) 5869 Invariant[ClassID] = Usage; 5870 else 5871 Invariant[ClassID] += Usage; 5872 } 5873 5874 LLVM_DEBUG({ 5875 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5876 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5877 << " item\n"; 5878 for (const auto &pair : MaxUsages[i]) { 5879 dbgs() << "LV(REG): RegisterClass: " 5880 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5881 << " registers\n"; 5882 } 5883 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5884 << " item\n"; 5885 for (const auto &pair : Invariant) { 5886 dbgs() << "LV(REG): RegisterClass: " 5887 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5888 << " registers\n"; 5889 } 5890 }); 5891 5892 RU.LoopInvariantRegs = Invariant; 5893 RU.MaxLocalUsers = MaxUsages[i]; 5894 RUs[i] = RU; 5895 } 5896 5897 return RUs; 5898 } 5899 5900 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5901 // TODO: Cost model for emulated masked load/store is completely 5902 // broken. This hack guides the cost model to use an artificially 5903 // high enough value to practically disable vectorization with such 5904 // operations, except where previously deployed legality hack allowed 5905 // using very low cost values. This is to avoid regressions coming simply 5906 // from moving "masked load/store" check from legality to cost model. 5907 // Masked Load/Gather emulation was previously never allowed. 5908 // Limited number of Masked Store/Scatter emulation was allowed. 5909 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5910 return isa<LoadInst>(I) || 5911 (isa<StoreInst>(I) && 5912 NumPredStores > NumberOfStoresToPredicate); 5913 } 5914 5915 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 5916 // If we aren't vectorizing the loop, or if we've already collected the 5917 // instructions to scalarize, there's nothing to do. Collection may already 5918 // have occurred if we have a user-selected VF and are now computing the 5919 // expected cost for interleaving. 5920 if (VF.isScalar() || VF.isZero() || 5921 InstsToScalarize.find(VF) != InstsToScalarize.end()) 5922 return; 5923 5924 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5925 // not profitable to scalarize any instructions, the presence of VF in the 5926 // map will indicate that we've analyzed it already. 5927 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5928 5929 // Find all the instructions that are scalar with predication in the loop and 5930 // determine if it would be better to not if-convert the blocks they are in. 5931 // If so, we also record the instructions to scalarize. 5932 for (BasicBlock *BB : TheLoop->blocks()) { 5933 if (!blockNeedsPredication(BB)) 5934 continue; 5935 for (Instruction &I : *BB) 5936 if (isScalarWithPredication(&I)) { 5937 ScalarCostsTy ScalarCosts; 5938 // Do not apply discount logic if hacked cost is needed 5939 // for emulated masked memrefs. 
5940 if (!useEmulatedMaskMemRefHack(&I) && 5941 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5942 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5943 // Remember that BB will remain after vectorization. 5944 PredicatedBBsAfterVectorization.insert(BB); 5945 } 5946 } 5947 } 5948 5949 int LoopVectorizationCostModel::computePredInstDiscount( 5950 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5951 ElementCount VF) { 5952 assert(!isUniformAfterVectorization(PredInst, VF) && 5953 "Instruction marked uniform-after-vectorization will be predicated"); 5954 5955 // Initialize the discount to zero, meaning that the scalar version and the 5956 // vector version cost the same. 5957 int Discount = 0; 5958 5959 // Holds instructions to analyze. The instructions we visit are mapped in 5960 // ScalarCosts. Those instructions are the ones that would be scalarized if 5961 // we find that the scalar version costs less. 5962 SmallVector<Instruction *, 8> Worklist; 5963 5964 // Returns true if the given instruction can be scalarized. 5965 auto canBeScalarized = [&](Instruction *I) -> bool { 5966 // We only attempt to scalarize instructions forming a single-use chain 5967 // from the original predicated block that would otherwise be vectorized. 5968 // Although not strictly necessary, we give up on instructions we know will 5969 // already be scalar to avoid traversing chains that are unlikely to be 5970 // beneficial. 5971 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5972 isScalarAfterVectorization(I, VF)) 5973 return false; 5974 5975 // If the instruction is scalar with predication, it will be analyzed 5976 // separately. We ignore it within the context of PredInst. 5977 if (isScalarWithPredication(I)) 5978 return false; 5979 5980 // If any of the instruction's operands are uniform after vectorization, 5981 // the instruction cannot be scalarized. This prevents, for example, a 5982 // masked load from being scalarized. 5983 // 5984 // We assume we will only emit a value for lane zero of an instruction 5985 // marked uniform after vectorization, rather than VF identical values. 5986 // Thus, if we scalarize an instruction that uses a uniform, we would 5987 // create uses of values corresponding to the lanes we aren't emitting code 5988 // for. This behavior can be changed by allowing getScalarValue to clone 5989 // the lane zero values for uniforms rather than asserting. 5990 for (Use &U : I->operands()) 5991 if (auto *J = dyn_cast<Instruction>(U.get())) 5992 if (isUniformAfterVectorization(J, VF)) 5993 return false; 5994 5995 // Otherwise, we can scalarize the instruction. 5996 return true; 5997 }; 5998 5999 // Compute the expected cost discount from scalarizing the entire expression 6000 // feeding the predicated instruction. We currently only consider expressions 6001 // that are single-use instruction chains. 6002 Worklist.push_back(PredInst); 6003 while (!Worklist.empty()) { 6004 Instruction *I = Worklist.pop_back_val(); 6005 6006 // If we've already analyzed the instruction, there's nothing to do. 6007 if (ScalarCosts.find(I) != ScalarCosts.end()) 6008 continue; 6009 6010 // Compute the cost of the vector instruction. Note that this cost already 6011 // includes the scalarization overhead of the predicated instruction. 6012 unsigned VectorCost = getInstructionCost(I, VF).first; 6013 6014 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6015 // the instruction as if it wasn't if-converted and instead remained in the 6016 // predicated block. We will scale this cost by block probability after 6017 // computing the scalarization overhead. 6018 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6019 unsigned ScalarCost = 6020 VF.getKnownMinValue() * 6021 getInstructionCost(I, ElementCount::getFixed(1)).first; 6022 6023 // Compute the scalarization overhead of needed insertelement instructions 6024 // and phi nodes. 6025 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6026 ScalarCost += TTI.getScalarizationOverhead( 6027 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6028 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6029 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6030 ScalarCost += 6031 VF.getKnownMinValue() * 6032 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6033 } 6034 6035 // Compute the scalarization overhead of needed extractelement 6036 // instructions. For each of the instruction's operands, if the operand can 6037 // be scalarized, add it to the worklist; otherwise, account for the 6038 // overhead. 6039 for (Use &U : I->operands()) 6040 if (auto *J = dyn_cast<Instruction>(U.get())) { 6041 assert(VectorType::isValidElementType(J->getType()) && 6042 "Instruction has non-scalar type"); 6043 if (canBeScalarized(J)) 6044 Worklist.push_back(J); 6045 else if (needsExtract(J, VF)) { 6046 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6047 ScalarCost += TTI.getScalarizationOverhead( 6048 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6049 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6050 } 6051 } 6052 6053 // Scale the total scalar cost by block probability. 6054 ScalarCost /= getReciprocalPredBlockProb(); 6055 6056 // Compute the discount. A non-negative discount means the vector version 6057 // of the instruction costs more, and scalarizing would be beneficial. 6058 Discount += VectorCost - ScalarCost; 6059 ScalarCosts[I] = ScalarCost; 6060 } 6061 6062 return Discount; 6063 } 6064 6065 LoopVectorizationCostModel::VectorizationCostTy 6066 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6067 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6068 VectorizationCostTy Cost; 6069 6070 // For each block. 6071 for (BasicBlock *BB : TheLoop->blocks()) { 6072 VectorizationCostTy BlockCost; 6073 6074 // For each instruction in the old loop. 6075 for (Instruction &I : BB->instructionsWithoutDebug()) { 6076 // Skip ignored values. 6077 if (ValuesToIgnore.count(&I) || 6078 (VF.isVector() && VecValuesToIgnore.count(&I))) 6079 continue; 6080 6081 VectorizationCostTy C = getInstructionCost(&I, VF); 6082 6083 // Check if we should override the cost. 6084 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6085 C.first = ForceTargetInstructionCost; 6086 6087 BlockCost.first += C.first; 6088 BlockCost.second |= C.second; 6089 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6090 << " for VF " << VF << " For instruction: " << I 6091 << '\n'); 6092 } 6093 6094 // If we are vectorizing a predicated block, it will have been 6095 // if-converted. This means that the block's instructions (aside from 6096 // stores and instructions that may divide by zero) will now be 6097 // unconditionally executed. For the scalar case, we may not always execute 6098 // the predicated block. Thus, scale the block's cost by the probability of 6099 // executing it. 
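    // For example, assuming the usual 50% estimate for a predicated block
    // (i.e. getReciprocalPredBlockProb() == 2), a block whose instructions
    // add up to a scalar cost of 8 only contributes 4 to the scalar loop
    // cost, since it is not expected to run on every iteration.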
6100 if (VF.isScalar() && blockNeedsPredication(BB)) 6101 BlockCost.first /= getReciprocalPredBlockProb(); 6102 6103 Cost.first += BlockCost.first; 6104 Cost.second |= BlockCost.second; 6105 } 6106 6107 return Cost; 6108 } 6109 6110 /// Gets Address Access SCEV after verifying that the access pattern 6111 /// is loop invariant except the induction variable dependence. 6112 /// 6113 /// This SCEV can be sent to the Target in order to estimate the address 6114 /// calculation cost. 6115 static const SCEV *getAddressAccessSCEV( 6116 Value *Ptr, 6117 LoopVectorizationLegality *Legal, 6118 PredicatedScalarEvolution &PSE, 6119 const Loop *TheLoop) { 6120 6121 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6122 if (!Gep) 6123 return nullptr; 6124 6125 // We are looking for a gep with all loop invariant indices except for one 6126 // which should be an induction variable. 6127 auto SE = PSE.getSE(); 6128 unsigned NumOperands = Gep->getNumOperands(); 6129 for (unsigned i = 1; i < NumOperands; ++i) { 6130 Value *Opd = Gep->getOperand(i); 6131 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6132 !Legal->isInductionVariable(Opd)) 6133 return nullptr; 6134 } 6135 6136 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6137 return PSE.getSCEV(Ptr); 6138 } 6139 6140 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6141 return Legal->hasStride(I->getOperand(0)) || 6142 Legal->hasStride(I->getOperand(1)); 6143 } 6144 6145 unsigned 6146 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6147 ElementCount VF) { 6148 assert(VF.isVector() && 6149 "Scalarization cost of instruction implies vectorization."); 6150 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6151 Type *ValTy = getMemInstValueType(I); 6152 auto SE = PSE.getSE(); 6153 6154 unsigned AS = getLoadStoreAddressSpace(I); 6155 Value *Ptr = getLoadStorePointerOperand(I); 6156 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6157 6158 // Figure out whether the access is strided and get the stride value 6159 // if it's known in compile time 6160 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6161 6162 // Get the cost of the scalar memory instruction and address computation. 6163 unsigned Cost = 6164 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6165 6166 // Don't pass *I here, since it is scalar but will actually be part of a 6167 // vectorized loop where the user of it is a vectorized instruction. 6168 const Align Alignment = getLoadStoreAlignment(I); 6169 Cost += VF.getKnownMinValue() * 6170 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6171 AS, TTI::TCK_RecipThroughput); 6172 6173 // Get the overhead of the extractelement and insertelement instructions 6174 // we might create due to scalarization. 6175 Cost += getScalarizationOverhead(I, VF); 6176 6177 // If we have a predicated store, it may not be executed for each vector 6178 // lane. Scale the cost by the probability of executing the predicated 6179 // block. 6180 if (isPredicatedInst(I)) { 6181 Cost /= getReciprocalPredBlockProb(); 6182 6183 if (useEmulatedMaskMemRefHack(I)) 6184 // Artificially setting to a high enough value to practically disable 6185 // vectorization with such operations. 
6186 Cost = 3000000; 6187 } 6188 6189 return Cost; 6190 } 6191 6192 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6193 ElementCount VF) { 6194 Type *ValTy = getMemInstValueType(I); 6195 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6196 Value *Ptr = getLoadStorePointerOperand(I); 6197 unsigned AS = getLoadStoreAddressSpace(I); 6198 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6199 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6200 6201 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6202 "Stride should be 1 or -1 for consecutive memory access"); 6203 const Align Alignment = getLoadStoreAlignment(I); 6204 unsigned Cost = 0; 6205 if (Legal->isMaskRequired(I)) 6206 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6207 CostKind); 6208 else 6209 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6210 CostKind, I); 6211 6212 bool Reverse = ConsecutiveStride < 0; 6213 if (Reverse) 6214 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6215 return Cost; 6216 } 6217 6218 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6219 ElementCount VF) { 6220 Type *ValTy = getMemInstValueType(I); 6221 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6222 const Align Alignment = getLoadStoreAlignment(I); 6223 unsigned AS = getLoadStoreAddressSpace(I); 6224 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6225 if (isa<LoadInst>(I)) { 6226 return TTI.getAddressComputationCost(ValTy) + 6227 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6228 CostKind) + 6229 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6230 } 6231 StoreInst *SI = cast<StoreInst>(I); 6232 6233 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6234 return TTI.getAddressComputationCost(ValTy) + 6235 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6236 CostKind) + 6237 (isLoopInvariantStoreValue 6238 ? 0 6239 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6240 VF.getKnownMinValue() - 1)); 6241 } 6242 6243 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6244 ElementCount VF) { 6245 Type *ValTy = getMemInstValueType(I); 6246 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6247 const Align Alignment = getLoadStoreAlignment(I); 6248 const Value *Ptr = getLoadStorePointerOperand(I); 6249 6250 return TTI.getAddressComputationCost(VectorTy) + 6251 TTI.getGatherScatterOpCost( 6252 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6253 TargetTransformInfo::TCK_RecipThroughput, I); 6254 } 6255 6256 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6257 ElementCount VF) { 6258 Type *ValTy = getMemInstValueType(I); 6259 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6260 unsigned AS = getLoadStoreAddressSpace(I); 6261 6262 auto Group = getInterleavedAccessGroup(I); 6263 assert(Group && "Fail to get an interleaved access group."); 6264 6265 unsigned InterleaveFactor = Group->getFactor(); 6266 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6267 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6268 6269 // Holds the indices of existing members in an interleaved load group. 6270 // An interleaved store group doesn't need this as it doesn't allow gaps. 
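  // For instance (hypothetical group): a load group with factor 3 in which
  // only members 0 and 2 exist yields Indices == {0, 2}, and for VF == 4 the
  // wide vector type below spans VF * factor == 12 elements. The missing
  // member is a gap, which may in turn require either a scalar epilogue or
  // masking (UseMaskForGaps) so the wide load does not touch elements the
  // original scalar accesses never did.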
6271 SmallVector<unsigned, 4> Indices; 6272 if (isa<LoadInst>(I)) { 6273 for (unsigned i = 0; i < InterleaveFactor; i++) 6274 if (Group->getMember(i)) 6275 Indices.push_back(i); 6276 } 6277 6278 // Calculate the cost of the whole interleaved group. 6279 bool UseMaskForGaps = 6280 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6281 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6282 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6283 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6284 6285 if (Group->isReverse()) { 6286 // TODO: Add support for reversed masked interleaved access. 6287 assert(!Legal->isMaskRequired(I) && 6288 "Reverse masked interleaved access not supported."); 6289 Cost += Group->getNumMembers() * 6290 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6291 } 6292 return Cost; 6293 } 6294 6295 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6296 ElementCount VF) { 6297 // Calculate scalar cost only. Vectorization cost should be ready at this 6298 // moment. 6299 if (VF.isScalar()) { 6300 Type *ValTy = getMemInstValueType(I); 6301 const Align Alignment = getLoadStoreAlignment(I); 6302 unsigned AS = getLoadStoreAddressSpace(I); 6303 6304 return TTI.getAddressComputationCost(ValTy) + 6305 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6306 TTI::TCK_RecipThroughput, I); 6307 } 6308 return getWideningCost(I, VF); 6309 } 6310 6311 LoopVectorizationCostModel::VectorizationCostTy 6312 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6313 ElementCount VF) { 6314 assert(!VF.isScalable() && 6315 "the cost model is not yet implemented for scalable vectorization"); 6316 // If we know that this instruction will remain uniform, check the cost of 6317 // the scalar version. 6318 if (isUniformAfterVectorization(I, VF)) 6319 VF = ElementCount::getFixed(1); 6320 6321 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6322 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6323 6324 // Forced scalars do not have any scalarization overhead. 6325 auto ForcedScalar = ForcedScalars.find(VF); 6326 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6327 auto InstSet = ForcedScalar->second; 6328 if (InstSet.count(I)) 6329 return VectorizationCostTy( 6330 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6331 VF.getKnownMinValue()), 6332 false); 6333 } 6334 6335 Type *VectorTy; 6336 unsigned C = getInstructionCost(I, VF, VectorTy); 6337 6338 bool TypeNotScalarized = 6339 VF.isVector() && VectorTy->isVectorTy() && 6340 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6341 return VectorizationCostTy(C, TypeNotScalarized); 6342 } 6343 6344 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6345 ElementCount VF) { 6346 6347 assert(!VF.isScalable() && 6348 "cannot compute scalarization overhead for scalable vectorization"); 6349 if (VF.isScalar()) 6350 return 0; 6351 6352 unsigned Cost = 0; 6353 Type *RetTy = ToVectorTy(I->getType(), VF); 6354 if (!RetTy->isVoidTy() && 6355 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6356 Cost += TTI.getScalarizationOverhead( 6357 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 6358 true, false); 6359 6360 // Some targets keep addresses scalar. 6361 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6362 return Cost; 6363 6364 // Some targets support efficient element stores. 
6365 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6366 return Cost; 6367 6368 // Collect operands to consider. 6369 CallInst *CI = dyn_cast<CallInst>(I); 6370 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6371 6372 // Skip operands that do not require extraction/scalarization and do not incur 6373 // any overhead. 6374 return Cost + TTI.getOperandsScalarizationOverhead( 6375 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 6376 } 6377 6378 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6379 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6380 if (VF.isScalar()) 6381 return; 6382 NumPredStores = 0; 6383 for (BasicBlock *BB : TheLoop->blocks()) { 6384 // For each instruction in the old loop. 6385 for (Instruction &I : *BB) { 6386 Value *Ptr = getLoadStorePointerOperand(&I); 6387 if (!Ptr) 6388 continue; 6389 6390 // TODO: We should generate better code and update the cost model for 6391 // predicated uniform stores. Today they are treated as any other 6392 // predicated store (see added test cases in 6393 // invariant-store-vectorization.ll). 6394 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6395 NumPredStores++; 6396 6397 if (Legal->isUniform(Ptr) && 6398 // Conditional loads and stores should be scalarized and predicated. 6399 // isScalarWithPredication cannot be used here since masked 6400 // gather/scatters are not considered scalar with predication. 6401 !Legal->blockNeedsPredication(I.getParent())) { 6402 // TODO: Avoid replicating loads and stores instead of 6403 // relying on instcombine to remove them. 6404 // Load: Scalar load + broadcast 6405 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6406 unsigned Cost = getUniformMemOpCost(&I, VF); 6407 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6408 continue; 6409 } 6410 6411 // We assume that widening is the best solution when possible. 6412 if (memoryInstructionCanBeWidened(&I, VF)) { 6413 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6414 int ConsecutiveStride = 6415 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6416 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6417 "Expected consecutive stride."); 6418 InstWidening Decision = 6419 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6420 setWideningDecision(&I, VF, Decision, Cost); 6421 continue; 6422 } 6423 6424 // Choose between Interleaving, Gather/Scatter or Scalarization. 6425 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6426 unsigned NumAccesses = 1; 6427 if (isAccessInterleaved(&I)) { 6428 auto Group = getInterleavedAccessGroup(&I); 6429 assert(Group && "Fail to get an interleaved access group."); 6430 6431 // Make one decision for the whole group. 6432 if (getWideningDecision(&I, VF) != CM_Unknown) 6433 continue; 6434 6435 NumAccesses = Group->getNumMembers(); 6436 if (interleavedAccessCanBeWidened(&I, VF)) 6437 InterleaveCost = getInterleaveGroupCost(&I, VF); 6438 } 6439 6440 unsigned GatherScatterCost = 6441 isLegalGatherOrScatter(&I) 6442 ? getGatherScatterCost(&I, VF) * NumAccesses 6443 : std::numeric_limits<unsigned>::max(); 6444 6445 unsigned ScalarizationCost = 6446 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6447 6448 // Choose better solution for the current VF, 6449 // write down this decision and use it during vectorization. 
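      // E.g. (hypothetical costs): with InterleaveCost == 6,
      // GatherScatterCost == 8 and ScalarizationCost == 12, the comparison
      // below picks CM_Interleave at cost 6; when the interleave and
      // gather/scatter options are unavailable they keep their "max" sentinel
      // values and scalarization wins by default.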
6450 unsigned Cost; 6451 InstWidening Decision; 6452 if (InterleaveCost <= GatherScatterCost && 6453 InterleaveCost < ScalarizationCost) { 6454 Decision = CM_Interleave; 6455 Cost = InterleaveCost; 6456 } else if (GatherScatterCost < ScalarizationCost) { 6457 Decision = CM_GatherScatter; 6458 Cost = GatherScatterCost; 6459 } else { 6460 Decision = CM_Scalarize; 6461 Cost = ScalarizationCost; 6462 } 6463 // If the instructions belongs to an interleave group, the whole group 6464 // receives the same decision. The whole group receives the cost, but 6465 // the cost will actually be assigned to one instruction. 6466 if (auto Group = getInterleavedAccessGroup(&I)) 6467 setWideningDecision(Group, VF, Decision, Cost); 6468 else 6469 setWideningDecision(&I, VF, Decision, Cost); 6470 } 6471 } 6472 6473 // Make sure that any load of address and any other address computation 6474 // remains scalar unless there is gather/scatter support. This avoids 6475 // inevitable extracts into address registers, and also has the benefit of 6476 // activating LSR more, since that pass can't optimize vectorized 6477 // addresses. 6478 if (TTI.prefersVectorizedAddressing()) 6479 return; 6480 6481 // Start with all scalar pointer uses. 6482 SmallPtrSet<Instruction *, 8> AddrDefs; 6483 for (BasicBlock *BB : TheLoop->blocks()) 6484 for (Instruction &I : *BB) { 6485 Instruction *PtrDef = 6486 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6487 if (PtrDef && TheLoop->contains(PtrDef) && 6488 getWideningDecision(&I, VF) != CM_GatherScatter) 6489 AddrDefs.insert(PtrDef); 6490 } 6491 6492 // Add all instructions used to generate the addresses. 6493 SmallVector<Instruction *, 4> Worklist; 6494 for (auto *I : AddrDefs) 6495 Worklist.push_back(I); 6496 while (!Worklist.empty()) { 6497 Instruction *I = Worklist.pop_back_val(); 6498 for (auto &Op : I->operands()) 6499 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6500 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6501 AddrDefs.insert(InstOp).second) 6502 Worklist.push_back(InstOp); 6503 } 6504 6505 for (auto *I : AddrDefs) { 6506 if (isa<LoadInst>(I)) { 6507 // Setting the desired widening decision should ideally be handled in 6508 // by cost functions, but since this involves the task of finding out 6509 // if the loaded register is involved in an address computation, it is 6510 // instead changed here when we know this is the case. 6511 InstWidening Decision = getWideningDecision(I, VF); 6512 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6513 // Scalarize a widened load of address. 6514 setWideningDecision( 6515 I, VF, CM_Scalarize, 6516 (VF.getKnownMinValue() * 6517 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6518 else if (auto Group = getInterleavedAccessGroup(I)) { 6519 // Scalarize an interleave group of address loads. 6520 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6521 if (Instruction *Member = Group->getMember(I)) 6522 setWideningDecision( 6523 Member, VF, CM_Scalarize, 6524 (VF.getKnownMinValue() * 6525 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6526 } 6527 } 6528 } else 6529 // Make sure I gets scalarized and a cost estimate without 6530 // scalarization overhead. 
6531 ForcedScalars[VF].insert(I); 6532 } 6533 } 6534 6535 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6536 ElementCount VF, 6537 Type *&VectorTy) { 6538 Type *RetTy = I->getType(); 6539 if (canTruncateToMinimalBitwidth(I, VF)) 6540 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6541 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6542 auto SE = PSE.getSE(); 6543 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6544 6545 // TODO: We need to estimate the cost of intrinsic calls. 6546 switch (I->getOpcode()) { 6547 case Instruction::GetElementPtr: 6548 // We mark this instruction as zero-cost because the cost of GEPs in 6549 // vectorized code depends on whether the corresponding memory instruction 6550 // is scalarized or not. Therefore, we handle GEPs with the memory 6551 // instruction cost. 6552 return 0; 6553 case Instruction::Br: { 6554 // In cases of scalarized and predicated instructions, there will be VF 6555 // predicated blocks in the vectorized loop. Each branch around these 6556 // blocks requires also an extract of its vector compare i1 element. 6557 bool ScalarPredicatedBB = false; 6558 BranchInst *BI = cast<BranchInst>(I); 6559 if (VF.isVector() && BI->isConditional() && 6560 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6561 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6562 ScalarPredicatedBB = true; 6563 6564 if (ScalarPredicatedBB) { 6565 // Return cost for branches around scalarized and predicated blocks. 6566 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6567 auto *Vec_i1Ty = 6568 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6569 return (TTI.getScalarizationOverhead( 6570 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6571 false, true) + 6572 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 6573 VF.getKnownMinValue())); 6574 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6575 // The back-edge branch will remain, as will all scalar branches. 6576 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6577 else 6578 // This branch will be eliminated by if-conversion. 6579 return 0; 6580 // Note: We currently assume zero cost for an unconditional branch inside 6581 // a predicated block since it will become a fall-through, although we 6582 // may decide in the future to call TTI for all branches. 6583 } 6584 case Instruction::PHI: { 6585 auto *Phi = cast<PHINode>(I); 6586 6587 // First-order recurrences are replaced by vector shuffles inside the loop. 6588 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6589 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 6590 return TTI.getShuffleCost( 6591 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 6592 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 6593 6594 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6595 // converted into select instructions. We require N - 1 selects per phi 6596 // node, where N is the number of incoming values. 
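    // For example, a phi in an if-converted block with 3 incoming values is
    // lowered to 2 vector selects, so the cost below is twice the cost of a
    // single select on the widened type.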
6597 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 6598 return (Phi->getNumIncomingValues() - 1) * 6599 TTI.getCmpSelInstrCost( 6600 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6601 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6602 CostKind); 6603 6604 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6605 } 6606 case Instruction::UDiv: 6607 case Instruction::SDiv: 6608 case Instruction::URem: 6609 case Instruction::SRem: 6610 // If we have a predicated instruction, it may not be executed for each 6611 // vector lane. Get the scalarization cost and scale this amount by the 6612 // probability of executing the predicated block. If the instruction is not 6613 // predicated, we fall through to the next case. 6614 if (VF.isVector() && isScalarWithPredication(I)) { 6615 unsigned Cost = 0; 6616 6617 // These instructions have a non-void type, so account for the phi nodes 6618 // that we will create. This cost is likely to be zero. The phi node 6619 // cost, if any, should be scaled by the block probability because it 6620 // models a copy at the end of each predicated block. 6621 Cost += VF.getKnownMinValue() * 6622 TTI.getCFInstrCost(Instruction::PHI, CostKind); 6623 6624 // The cost of the non-predicated instruction. 6625 Cost += VF.getKnownMinValue() * 6626 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6627 6628 // The cost of insertelement and extractelement instructions needed for 6629 // scalarization. 6630 Cost += getScalarizationOverhead(I, VF); 6631 6632 // Scale the cost by the probability of executing the predicated blocks. 6633 // This assumes the predicated block for each vector lane is equally 6634 // likely. 6635 return Cost / getReciprocalPredBlockProb(); 6636 } 6637 LLVM_FALLTHROUGH; 6638 case Instruction::Add: 6639 case Instruction::FAdd: 6640 case Instruction::Sub: 6641 case Instruction::FSub: 6642 case Instruction::Mul: 6643 case Instruction::FMul: 6644 case Instruction::FDiv: 6645 case Instruction::FRem: 6646 case Instruction::Shl: 6647 case Instruction::LShr: 6648 case Instruction::AShr: 6649 case Instruction::And: 6650 case Instruction::Or: 6651 case Instruction::Xor: { 6652 // Since we will replace the stride by 1 the multiplication should go away. 6653 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6654 return 0; 6655 // Certain instructions can be cheaper to vectorize if they have a constant 6656 // second vector operand. One example of this are shifts on x86. 6657 Value *Op2 = I->getOperand(1); 6658 TargetTransformInfo::OperandValueProperties Op2VP; 6659 TargetTransformInfo::OperandValueKind Op2VK = 6660 TTI.getOperandInfo(Op2, Op2VP); 6661 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6662 Op2VK = TargetTransformInfo::OK_UniformValue; 6663 6664 SmallVector<const Value *, 4> Operands(I->operand_values()); 6665 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 6666 return N * TTI.getArithmeticInstrCost( 6667 I->getOpcode(), VectorTy, CostKind, 6668 TargetTransformInfo::OK_AnyValue, 6669 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6670 } 6671 case Instruction::FNeg: { 6672 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 6673 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 6674 return N * TTI.getArithmeticInstrCost( 6675 I->getOpcode(), VectorTy, CostKind, 6676 TargetTransformInfo::OK_AnyValue, 6677 TargetTransformInfo::OK_AnyValue, 6678 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6679 I->getOperand(0), I); 6680 } 6681 case Instruction::Select: { 6682 SelectInst *SI = cast<SelectInst>(I); 6683 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6684 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6685 Type *CondTy = SI->getCondition()->getType(); 6686 if (!ScalarCond) { 6687 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 6688 CondTy = VectorType::get(CondTy, VF); 6689 } 6690 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 6691 CostKind, I); 6692 } 6693 case Instruction::ICmp: 6694 case Instruction::FCmp: { 6695 Type *ValTy = I->getOperand(0)->getType(); 6696 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6697 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6698 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6699 VectorTy = ToVectorTy(ValTy, VF); 6700 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, 6701 I); 6702 } 6703 case Instruction::Store: 6704 case Instruction::Load: { 6705 ElementCount Width = VF; 6706 if (Width.isVector()) { 6707 InstWidening Decision = getWideningDecision(I, Width); 6708 assert(Decision != CM_Unknown && 6709 "CM decision should be taken at this point"); 6710 if (Decision == CM_Scalarize) 6711 Width = ElementCount::getFixed(1); 6712 } 6713 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6714 return getMemoryInstructionCost(I, VF); 6715 } 6716 case Instruction::ZExt: 6717 case Instruction::SExt: 6718 case Instruction::FPToUI: 6719 case Instruction::FPToSI: 6720 case Instruction::FPExt: 6721 case Instruction::PtrToInt: 6722 case Instruction::IntToPtr: 6723 case Instruction::SIToFP: 6724 case Instruction::UIToFP: 6725 case Instruction::Trunc: 6726 case Instruction::FPTrunc: 6727 case Instruction::BitCast: { 6728 // Computes the CastContextHint from a Load/Store instruction. 6729 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 6730 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6731 "Expected a load or a store!"); 6732 6733 if (VF.isScalar() || !TheLoop->contains(I)) 6734 return TTI::CastContextHint::Normal; 6735 6736 switch (getWideningDecision(I, VF)) { 6737 case LoopVectorizationCostModel::CM_GatherScatter: 6738 return TTI::CastContextHint::GatherScatter; 6739 case LoopVectorizationCostModel::CM_Interleave: 6740 return TTI::CastContextHint::Interleave; 6741 case LoopVectorizationCostModel::CM_Scalarize: 6742 case LoopVectorizationCostModel::CM_Widen: 6743 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 6744 : TTI::CastContextHint::Normal; 6745 case LoopVectorizationCostModel::CM_Widen_Reverse: 6746 return TTI::CastContextHint::Reversed; 6747 case LoopVectorizationCostModel::CM_Unknown: 6748 llvm_unreachable("Instr did not go through cost modelling?"); 6749 } 6750 6751 llvm_unreachable("Unhandled case!"); 6752 }; 6753 6754 unsigned Opcode = I->getOpcode(); 6755 TTI::CastContextHint CCH = TTI::CastContextHint::None; 6756 // For Trunc, the context is the only user, which must be a StoreInst. 
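    // E.g. a trunc whose single user is a store with a CM_Widen_Reverse
    // decision is costed with TTI::CastContextHint::Reversed, while a zext
    // fed by a load that will become a gather uses
    // TTI::CastContextHint::GatherScatter.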
6757 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 6758 if (I->hasOneUse()) 6759 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 6760 CCH = ComputeCCH(Store); 6761 } 6762 // For Z/Sext, the context is the operand, which must be a LoadInst. 6763 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 6764 Opcode == Instruction::FPExt) { 6765 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 6766 CCH = ComputeCCH(Load); 6767 } 6768 6769 // We optimize the truncation of induction variables having constant 6770 // integer steps. The cost of these truncations is the same as the scalar 6771 // operation. 6772 if (isOptimizableIVTruncate(I, VF)) { 6773 auto *Trunc = cast<TruncInst>(I); 6774 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6775 Trunc->getSrcTy(), CCH, CostKind, Trunc); 6776 } 6777 6778 Type *SrcScalarTy = I->getOperand(0)->getType(); 6779 Type *SrcVecTy = 6780 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6781 if (canTruncateToMinimalBitwidth(I, VF)) { 6782 // This cast is going to be shrunk. This may remove the cast or it might 6783 // turn it into slightly different cast. For example, if MinBW == 16, 6784 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6785 // 6786 // Calculate the modified src and dest types. 6787 Type *MinVecTy = VectorTy; 6788 if (Opcode == Instruction::Trunc) { 6789 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6790 VectorTy = 6791 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6792 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 6793 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6794 VectorTy = 6795 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6796 } 6797 } 6798 6799 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 6800 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 6801 return N * 6802 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 6803 } 6804 case Instruction::Call: { 6805 bool NeedToScalarize; 6806 CallInst *CI = cast<CallInst>(I); 6807 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6808 if (getVectorIntrinsicIDForCall(CI, TLI)) 6809 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6810 return CallCost; 6811 } 6812 default: 6813 // The cost of executing VF copies of the scalar instruction. This opcode 6814 // is unknown. Assume that it is the same as 'mul'. 6815 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 6816 Instruction::Mul, VectorTy, CostKind) + 6817 getScalarizationOverhead(I, VF); 6818 } // end of switch. 
6819 } 6820 6821 char LoopVectorize::ID = 0; 6822 6823 static const char lv_name[] = "Loop Vectorization"; 6824 6825 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6826 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6827 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6828 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6829 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6830 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6831 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6832 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6833 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6834 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6835 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6836 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6837 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6838 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6839 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6840 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6841 6842 namespace llvm { 6843 6844 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6845 6846 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6847 bool VectorizeOnlyWhenForced) { 6848 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6849 } 6850 6851 } // end namespace llvm 6852 6853 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6854 // Check if the pointer operand of a load or store instruction is 6855 // consecutive. 6856 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6857 return Legal->isConsecutivePtr(Ptr); 6858 return false; 6859 } 6860 6861 void LoopVectorizationCostModel::collectValuesToIgnore() { 6862 // Ignore ephemeral values. 6863 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6864 6865 // Ignore type-promoting instructions we identified during reduction 6866 // detection. 6867 for (auto &Reduction : Legal->getReductionVars()) { 6868 RecurrenceDescriptor &RedDes = Reduction.second; 6869 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6870 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6871 } 6872 // Ignore type-casting instructions we identified during induction 6873 // detection. 6874 for (auto &Induction : Legal->getInductionVars()) { 6875 InductionDescriptor &IndDes = Induction.second; 6876 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6877 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6878 } 6879 } 6880 6881 void LoopVectorizationCostModel::collectInLoopReductions() { 6882 for (auto &Reduction : Legal->getReductionVars()) { 6883 PHINode *Phi = Reduction.first; 6884 RecurrenceDescriptor &RdxDesc = Reduction.second; 6885 6886 // We don't collect reductions that are type promoted (yet). 6887 if (RdxDesc.getRecurrenceType() != Phi->getType()) 6888 continue; 6889 6890 // If the target would prefer this reduction to happen "in-loop", then we 6891 // want to record it as such. 6892 unsigned Opcode = RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()); 6893 if (!PreferInLoopReductions && 6894 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 6895 TargetTransformInfo::ReductionFlags())) 6896 continue; 6897 6898 // Check that we can correctly put the reductions into the loop, by 6899 // finding the chain of operations that leads from the phi to the loop 6900 // exit value. 
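    // For a plain integer sum, for instance, the chain is just the single
    // add (e.g. "%sum.next = add %sum.phi, %x") that feeds both the phi on
    // the backedge and the value used after the loop; if no such chain can
    // be identified, the reduction stays out-of-loop.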
6901 SmallVector<Instruction *, 4> ReductionOperations = 6902 RdxDesc.getReductionOpChain(Phi, TheLoop); 6903 bool InLoop = !ReductionOperations.empty(); 6904 if (InLoop) 6905 InLoopReductionChains[Phi] = ReductionOperations; 6906 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 6907 << " reduction for phi: " << *Phi << "\n"); 6908 } 6909 } 6910 6911 // TODO: we could return a pair of values that specify the max VF and 6912 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6913 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 6914 // doesn't have a cost model that can choose which plan to execute if 6915 // more than one is generated. 6916 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6917 LoopVectorizationCostModel &CM) { 6918 unsigned WidestType; 6919 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6920 return WidestVectorRegBits / WidestType; 6921 } 6922 6923 VectorizationFactor 6924 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 6925 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 6926 ElementCount VF = UserVF; 6927 // Outer loop handling: They may require CFG and instruction level 6928 // transformations before even evaluating whether vectorization is profitable. 6929 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6930 // the vectorization pipeline. 6931 if (!OrigLoop->isInnermost()) { 6932 // If the user doesn't provide a vectorization factor, determine a 6933 // reasonable one. 6934 if (UserVF.isZero()) { 6935 VF = ElementCount::getFixed( 6936 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 6937 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6938 6939 // Make sure we have a VF > 1 for stress testing. 6940 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 6941 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6942 << "overriding computed VF.\n"); 6943 VF = ElementCount::getFixed(4); 6944 } 6945 } 6946 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6947 assert(isPowerOf2_32(VF.getKnownMinValue()) && 6948 "VF needs to be a power of two"); 6949 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 6950 << "VF " << VF << " to build VPlans.\n"); 6951 buildVPlans(VF.getKnownMinValue(), VF.getKnownMinValue()); 6952 6953 // For VPlan build stress testing, we bail out after VPlan construction. 6954 if (VPlanBuildStressTest) 6955 return VectorizationFactor::Disabled(); 6956 6957 return {VF, 0 /*Cost*/}; 6958 } 6959 6960 LLVM_DEBUG( 6961 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6962 "VPlan-native path.\n"); 6963 return VectorizationFactor::Disabled(); 6964 } 6965 6966 Optional<VectorizationFactor> 6967 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 6968 assert(!UserVF.isScalable() && "scalable vectorization not yet handled"); 6969 assert(OrigLoop->isInnermost() && "Inner loop expected."); 6970 Optional<unsigned> MaybeMaxVF = 6971 CM.computeMaxVF(UserVF.getKnownMinValue(), UserIC); 6972 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6973 return None; 6974 6975 // Invalidate interleave groups if all blocks of loop will be predicated. 
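  // (The header needs predication only when the tail is folded by masking; in
  // that case every member of an interleave group would have to be emitted as
  // a masked wide load/store, which needs masked-interleaved support from the
  // target.)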
6976 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6977 !useMaskedInterleavedAccesses(*TTI)) { 6978 LLVM_DEBUG( 6979 dbgs() 6980 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6981 "which requires masked-interleaved support.\n"); 6982 if (CM.InterleaveInfo.invalidateGroups()) 6983 // Invalidating interleave groups also requires invalidating all decisions 6984 // based on them, which includes widening decisions and uniform and scalar 6985 // values. 6986 CM.invalidateCostModelingDecisions(); 6987 } 6988 6989 if (!UserVF.isZero()) { 6990 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6991 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 6992 "VF needs to be a power of two"); 6993 // Collect the instructions (and their associated costs) that will be more 6994 // profitable to scalarize. 6995 CM.selectUserVectorizationFactor(UserVF); 6996 CM.collectInLoopReductions(); 6997 buildVPlansWithVPRecipes(UserVF.getKnownMinValue(), 6998 UserVF.getKnownMinValue()); 6999 LLVM_DEBUG(printPlans(dbgs())); 7000 return {{UserVF, 0}}; 7001 } 7002 7003 unsigned MaxVF = MaybeMaxVF.getValue(); 7004 assert(MaxVF != 0 && "MaxVF is zero."); 7005 7006 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 7007 // Collect Uniform and Scalar instructions after vectorization with VF. 7008 CM.collectUniformsAndScalars(ElementCount::getFixed(VF)); 7009 7010 // Collect the instructions (and their associated costs) that will be more 7011 // profitable to scalarize. 7012 if (VF > 1) 7013 CM.collectInstsToScalarize(ElementCount::getFixed(VF)); 7014 } 7015 7016 CM.collectInLoopReductions(); 7017 7018 buildVPlansWithVPRecipes(1, MaxVF); 7019 LLVM_DEBUG(printPlans(dbgs())); 7020 if (MaxVF == 1) 7021 return VectorizationFactor::Disabled(); 7022 7023 // Select the optimal vectorization factor. 7024 return CM.selectVectorizationFactor(MaxVF); 7025 } 7026 7027 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7028 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7029 << '\n'); 7030 BestVF = VF; 7031 BestUF = UF; 7032 7033 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7034 return !Plan->hasVF(VF); 7035 }); 7036 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7037 } 7038 7039 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7040 DominatorTree *DT) { 7041 // Perform the actual loop transformation. 7042 7043 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7044 VPCallbackILV CallbackILV(ILV); 7045 7046 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7047 7048 VPTransformState State{*BestVF, BestUF, LI, 7049 DT, ILV.Builder, ILV.VectorLoopValueMap, 7050 &ILV, CallbackILV}; 7051 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7052 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7053 State.CanonicalIV = ILV.Induction; 7054 7055 //===------------------------------------------------===// 7056 // 7057 // Notice: any optimization or new instruction that go 7058 // into the code below should also be implemented in 7059 // the cost-model. 7060 // 7061 //===------------------------------------------------===// 7062 7063 // 2. Copy and widen instructions from the old loop into the new loop. 7064 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7065 VPlans.front()->execute(&State); 7066 7067 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7068 // predication, updating analyses. 
7069 ILV.fixVectorizedLoop(); 7070 } 7071 7072 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7073 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7074 BasicBlock *Latch = OrigLoop->getLoopLatch(); 7075 7076 // We create new control-flow for the vectorized loop, so the original 7077 // condition will be dead after vectorization if it's only used by the 7078 // branch. 7079 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 7080 if (Cmp && Cmp->hasOneUse()) { 7081 DeadInstructions.insert(Cmp); 7082 7083 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7084 for (Value *Op : Cmp->operands()) { 7085 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7086 DeadInstructions.insert(cast<Instruction>(Op)); 7087 } 7088 } 7089 7090 // We create new "steps" for induction variable updates to which the original 7091 // induction variables map. An original update instruction will be dead if 7092 // all its users except the induction variable are dead. 7093 for (auto &Induction : Legal->getInductionVars()) { 7094 PHINode *Ind = Induction.first; 7095 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7096 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7097 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7098 })) 7099 DeadInstructions.insert(IndUpdate); 7100 7101 // We record as "Dead" also the type-casting instructions we had identified 7102 // during induction analysis. We don't need any handling for them in the 7103 // vectorized loop because we have proven that, under a proper runtime 7104 // test guarding the vectorized loop, the value of the phi, and the casted 7105 // value of the phi, are the same. The last instruction in this casting chain 7106 // will get its scalar/vector/widened def from the scalar/vector/widened def 7107 // of the respective phi node. Any other casts in the induction def-use chain 7108 // have no other uses outside the phi update chain, and will be ignored. 7109 InductionDescriptor &IndDes = Induction.second; 7110 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7111 DeadInstructions.insert(Casts.begin(), Casts.end()); 7112 } 7113 } 7114 7115 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7116 7117 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7118 7119 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7120 Instruction::BinaryOps BinOp) { 7121 // When unrolling and the VF is 1, we only need to add a simple scalar. 7122 Type *Ty = Val->getType(); 7123 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7124 7125 if (Ty->isFloatingPointTy()) { 7126 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7127 7128 // Floating point operations had to be 'fast' to enable the unrolling. 7129 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7130 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7131 } 7132 Constant *C = ConstantInt::get(Ty, StartIdx); 7133 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7134 } 7135 7136 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7137 SmallVector<Metadata *, 4> MDs; 7138 // Reserve first location for self reference to the LoopID metadata node. 7139 MDs.push_back(nullptr); 7140 bool IsUnrollMetadata = false; 7141 MDNode *LoopID = L->getLoopID(); 7142 if (LoopID) { 7143 // First find existing loop unrolling disable metadata. 
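    // A loop ID already carrying that hint typically looks like the following
    // (illustrative only):
    //   !0 = distinct !{!0, !1}
    //   !1 = !{!"llvm.loop.unroll.disable"}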
7144 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7145 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7146 if (MD) { 7147 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7148 IsUnrollMetadata = 7149 S && S->getString().startswith("llvm.loop.unroll.disable"); 7150 } 7151 MDs.push_back(LoopID->getOperand(i)); 7152 } 7153 } 7154 7155 if (!IsUnrollMetadata) { 7156 // Add runtime unroll disable metadata. 7157 LLVMContext &Context = L->getHeader()->getContext(); 7158 SmallVector<Metadata *, 1> DisableOperands; 7159 DisableOperands.push_back( 7160 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7161 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7162 MDs.push_back(DisableNode); 7163 MDNode *NewLoopID = MDNode::get(Context, MDs); 7164 // Set operand 0 to refer to the loop id itself. 7165 NewLoopID->replaceOperandWith(0, NewLoopID); 7166 L->setLoopID(NewLoopID); 7167 } 7168 } 7169 7170 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7171 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7172 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 7173 bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start)); 7174 7175 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 7176 if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) { 7177 Range.End = TmpVF; 7178 break; 7179 } 7180 7181 return PredicateAtRangeStart; 7182 } 7183 7184 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7185 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7186 /// of VF's starting at a given VF and extending it as much as possible. Each 7187 /// vectorization decision can potentially shorten this sub-range during 7188 /// buildVPlan(). 7189 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 7190 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7191 VFRange SubRange = {VF, MaxVF + 1}; 7192 VPlans.push_back(buildVPlan(SubRange)); 7193 VF = SubRange.End; 7194 } 7195 } 7196 7197 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7198 VPlanPtr &Plan) { 7199 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7200 7201 // Look for cached value. 7202 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7203 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7204 if (ECEntryIt != EdgeMaskCache.end()) 7205 return ECEntryIt->second; 7206 7207 VPValue *SrcMask = createBlockInMask(Src, Plan); 7208 7209 // The terminator has to be a branch inst! 7210 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7211 assert(BI && "Unexpected terminator found"); 7212 7213 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7214 return EdgeMaskCache[Edge] = SrcMask; 7215 7216 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 7217 assert(EdgeMask && "No Edge Mask found for condition"); 7218 7219 if (BI->getSuccessor(0) != Dst) 7220 EdgeMask = Builder.createNot(EdgeMask); 7221 7222 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 7223 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 7224 7225 return EdgeMaskCache[Edge] = EdgeMask; 7226 } 7227 7228 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7229 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7230 7231 // Look for cached value. 
7232 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7233 if (BCEntryIt != BlockMaskCache.end()) 7234 return BCEntryIt->second; 7235 7236 // All-one mask is modelled as no-mask following the convention for masked 7237 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7238 VPValue *BlockMask = nullptr; 7239 7240 if (OrigLoop->getHeader() == BB) { 7241 if (!CM.blockNeedsPredication(BB)) 7242 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 7243 7244 // Create the block in mask as the first non-phi instruction in the block. 7245 VPBuilder::InsertPointGuard Guard(Builder); 7246 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 7247 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 7248 7249 // Introduce the early-exit compare IV <= BTC to form header block mask. 7250 // This is used instead of IV < TC because TC may wrap, unlike BTC. 7251 // Start by constructing the desired canonical IV. 7252 VPValue *IV = nullptr; 7253 if (Legal->getPrimaryInduction()) 7254 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 7255 else { 7256 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 7257 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 7258 IV = IVRecipe->getVPValue(); 7259 } 7260 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 7261 bool TailFolded = !CM.isScalarEpilogueAllowed(); 7262 7263 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 7264 // While ActiveLaneMask is a binary op that consumes the loop tripcount 7265 // as a second argument, we only pass the IV here and extract the 7266 // tripcount from the transform state where codegen of the VP instructions 7267 // happen. 7268 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 7269 } else { 7270 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 7271 } 7272 return BlockMaskCache[BB] = BlockMask; 7273 } 7274 7275 // This is the block mask. We OR all incoming edges. 7276 for (auto *Predecessor : predecessors(BB)) { 7277 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 7278 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 7279 return BlockMaskCache[BB] = EdgeMask; 7280 7281 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
7282 BlockMask = EdgeMask; 7283 continue; 7284 } 7285 7286 BlockMask = Builder.createOr(BlockMask, EdgeMask); 7287 } 7288 7289 return BlockMaskCache[BB] = BlockMask; 7290 } 7291 7292 VPWidenMemoryInstructionRecipe * 7293 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 7294 VPlanPtr &Plan) { 7295 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7296 "Must be called with either a load or store"); 7297 7298 auto willWiden = [&](ElementCount VF) -> bool { 7299 assert(!VF.isScalable() && "unexpected scalable ElementCount"); 7300 if (VF.isScalar()) 7301 return false; 7302 LoopVectorizationCostModel::InstWidening Decision = 7303 CM.getWideningDecision(I, VF); 7304 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 7305 "CM decision should be taken at this point."); 7306 if (Decision == LoopVectorizationCostModel::CM_Interleave) 7307 return true; 7308 if (CM.isScalarAfterVectorization(I, VF) || 7309 CM.isProfitableToScalarize(I, VF)) 7310 return false; 7311 return Decision != LoopVectorizationCostModel::CM_Scalarize; 7312 }; 7313 7314 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7315 return nullptr; 7316 7317 VPValue *Mask = nullptr; 7318 if (Legal->isMaskRequired(I)) 7319 Mask = createBlockInMask(I->getParent(), Plan); 7320 7321 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 7322 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 7323 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 7324 7325 StoreInst *Store = cast<StoreInst>(I); 7326 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 7327 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 7328 } 7329 7330 VPWidenIntOrFpInductionRecipe * 7331 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 7332 // Check if this is an integer or fp induction. If so, build the recipe that 7333 // produces its scalar and vector values. 7334 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 7335 if (II.getKind() == InductionDescriptor::IK_IntInduction || 7336 II.getKind() == InductionDescriptor::IK_FpInduction) 7337 return new VPWidenIntOrFpInductionRecipe(Phi); 7338 7339 return nullptr; 7340 } 7341 7342 VPWidenIntOrFpInductionRecipe * 7343 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 7344 VFRange &Range) const { 7345 // Optimize the special case where the source is a constant integer 7346 // induction variable. Notice that we can only optimize the 'trunc' case 7347 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 7348 // (c) other casts depend on pointer size. 7349 7350 // Determine whether \p K is a truncation based on an induction variable that 7351 // can be optimized. 7352 auto isOptimizableIVTruncate = 7353 [&](Instruction *K) -> std::function<bool(ElementCount)> { 7354 return [=](ElementCount VF) -> bool { 7355 return CM.isOptimizableIVTruncate(K, VF); 7356 }; 7357 }; 7358 7359 if (LoopVectorizationPlanner::getDecisionAndClampRange( 7360 isOptimizableIVTruncate(I), Range)) 7361 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 7362 I); 7363 return nullptr; 7364 } 7365 7366 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 7367 // We know that all PHIs in non-header blocks are converted into selects, so 7368 // we don't have to worry about the insertion order and we can just use the 7369 // builder. At this point we generate the predication tree. 
There may be 7370 // duplications since this is a simple recursive scan, but future 7371 // optimizations will clean it up. 7372 7373 SmallVector<VPValue *, 2> Operands; 7374 unsigned NumIncoming = Phi->getNumIncomingValues(); 7375 for (unsigned In = 0; In < NumIncoming; In++) { 7376 VPValue *EdgeMask = 7377 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 7378 assert((EdgeMask || NumIncoming == 1) && 7379 "Multiple predecessors with one having a full mask"); 7380 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 7381 if (EdgeMask) 7382 Operands.push_back(EdgeMask); 7383 } 7384 return new VPBlendRecipe(Phi, Operands); 7385 } 7386 7387 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 7388 VPlan &Plan) const { 7389 7390 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7391 [this, CI](ElementCount VF) { 7392 return CM.isScalarWithPredication(CI, VF); 7393 }, 7394 Range); 7395 7396 if (IsPredicated) 7397 return nullptr; 7398 7399 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7400 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 7401 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 7402 return nullptr; 7403 7404 auto willWiden = [&](ElementCount VF) -> bool { 7405 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7406 // The following case may be scalarized depending on the VF. 7407 // The flag shows whether we use Intrinsic or a usual Call for vectorized 7408 // version of the instruction. 7409 // Is it beneficial to perform intrinsic call compared to lib call? 7410 bool NeedToScalarize = false; 7411 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 7412 bool UseVectorIntrinsic = 7413 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 7414 return UseVectorIntrinsic || !NeedToScalarize; 7415 }; 7416 7417 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7418 return nullptr; 7419 7420 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 7421 } 7422 7423 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 7424 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 7425 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 7426 // Instruction should be widened, unless it is scalar after vectorization, 7427 // scalarization is profitable or it is predicated. 
7428 auto WillScalarize = [this, I](ElementCount VF) -> bool { 7429 return CM.isScalarAfterVectorization(I, VF) || 7430 CM.isProfitableToScalarize(I, VF) || 7431 CM.isScalarWithPredication(I, VF); 7432 }; 7433 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 7434 Range); 7435 } 7436 7437 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 7438 auto IsVectorizableOpcode = [](unsigned Opcode) { 7439 switch (Opcode) { 7440 case Instruction::Add: 7441 case Instruction::And: 7442 case Instruction::AShr: 7443 case Instruction::BitCast: 7444 case Instruction::FAdd: 7445 case Instruction::FCmp: 7446 case Instruction::FDiv: 7447 case Instruction::FMul: 7448 case Instruction::FNeg: 7449 case Instruction::FPExt: 7450 case Instruction::FPToSI: 7451 case Instruction::FPToUI: 7452 case Instruction::FPTrunc: 7453 case Instruction::FRem: 7454 case Instruction::FSub: 7455 case Instruction::ICmp: 7456 case Instruction::IntToPtr: 7457 case Instruction::LShr: 7458 case Instruction::Mul: 7459 case Instruction::Or: 7460 case Instruction::PtrToInt: 7461 case Instruction::SDiv: 7462 case Instruction::Select: 7463 case Instruction::SExt: 7464 case Instruction::Shl: 7465 case Instruction::SIToFP: 7466 case Instruction::SRem: 7467 case Instruction::Sub: 7468 case Instruction::Trunc: 7469 case Instruction::UDiv: 7470 case Instruction::UIToFP: 7471 case Instruction::URem: 7472 case Instruction::Xor: 7473 case Instruction::ZExt: 7474 return true; 7475 } 7476 return false; 7477 }; 7478 7479 if (!IsVectorizableOpcode(I->getOpcode())) 7480 return nullptr; 7481 7482 // Success: widen this instruction. 7483 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 7484 } 7485 7486 VPBasicBlock *VPRecipeBuilder::handleReplication( 7487 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7488 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7489 VPlanPtr &Plan) { 7490 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7491 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 7492 Range); 7493 7494 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7495 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 7496 Range); 7497 7498 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 7499 IsUniform, IsPredicated); 7500 setRecipe(I, Recipe); 7501 7502 // Find if I uses a predicated instruction. If so, it will use its scalar 7503 // value. Avoid hoisting the insert-element which packs the scalar value into 7504 // a vector value, as that happens iff all users use the vector value. 7505 for (auto &Op : I->operands()) 7506 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7507 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7508 PredInst2Recipe[PredInst]->setAlsoPack(false); 7509 7510 // Finalize the recipe for Instr, first if it is not predicated. 7511 if (!IsPredicated) { 7512 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7513 VPBB->appendRecipe(Recipe); 7514 return VPBB; 7515 } 7516 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7517 assert(VPBB->getSuccessors().empty() && 7518 "VPBB has successors when handling predicated replication."); 7519 // Record predicated instructions for above packing optimizations. 
7520 PredInst2Recipe[I] = Recipe; 7521 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7522 VPBlockUtils::insertBlockAfter(Region, VPBB); 7523 auto *RegSucc = new VPBasicBlock(); 7524 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7525 return RegSucc; 7526 } 7527 7528 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7529 VPRecipeBase *PredRecipe, 7530 VPlanPtr &Plan) { 7531 // Instructions marked for predication are replicated and placed under an 7532 // if-then construct to prevent side-effects. 7533 7534 // Generate recipes to compute the block mask for this region. 7535 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7536 7537 // Build the triangular if-then region. 7538 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7539 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7540 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7541 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7542 auto *PHIRecipe = 7543 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7544 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7545 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7546 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7547 7548 // Note: first set Entry as region entry and then connect successors starting 7549 // from it in order, to propagate the "parent" of each VPBasicBlock. 7550 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7551 VPBlockUtils::connectBlocks(Pred, Exit); 7552 7553 return Region; 7554 } 7555 7556 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 7557 VFRange &Range, 7558 VPlanPtr &Plan) { 7559 // First, check for specific widening recipes that deal with calls, memory 7560 // operations, inductions and Phi nodes. 7561 if (auto *CI = dyn_cast<CallInst>(Instr)) 7562 return tryToWidenCall(CI, Range, *Plan); 7563 7564 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 7565 return tryToWidenMemory(Instr, Range, Plan); 7566 7567 VPRecipeBase *Recipe; 7568 if (auto Phi = dyn_cast<PHINode>(Instr)) { 7569 if (Phi->getParent() != OrigLoop->getHeader()) 7570 return tryToBlend(Phi, Plan); 7571 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 7572 return Recipe; 7573 return new VPWidenPHIRecipe(Phi); 7574 } 7575 7576 if (isa<TruncInst>(Instr) && 7577 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 7578 return Recipe; 7579 7580 if (!shouldWiden(Instr, Range)) 7581 return nullptr; 7582 7583 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 7584 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 7585 OrigLoop); 7586 7587 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 7588 bool InvariantCond = 7589 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 7590 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 7591 InvariantCond); 7592 } 7593 7594 return tryToWiden(Instr, *Plan); 7595 } 7596 7597 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7598 unsigned MaxVF) { 7599 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7600 7601 // Collect conditions feeding internal conditional branches; they need to be 7602 // represented in VPlan for it to model masking. 
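  // For illustration only, given a body containing
  //   if (a[i] > 0) b[i] = 0;
  // the compare feeding the internal branch is collected here so that
  // Plan->addVPValue() below gives it a VPValue, which the edge and block
  // masks built later can refer to.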
  SmallPtrSet<Value *, 1> NeedDef;

  auto *Latch = OrigLoop->getLoopLatch();
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (BB == Latch)
      continue;
    BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
    if (Branch && Branch->isConditional())
      NeedDef.insert(Branch->getCondition());
  }

  // If the tail is to be folded by masking, the primary induction variable, if
  // it exists, needs to be represented in VPlan for it to model early-exit
  // masking. Also, both the Phi and the live-out instruction of each reduction
  // are required in order to introduce a select between them in VPlan.
  if (CM.foldTailByMasking()) {
    if (Legal->getPrimaryInduction())
      NeedDef.insert(Legal->getPrimaryInduction());
    for (auto &Reduction : Legal->getReductionVars()) {
      NeedDef.insert(Reduction.first);
      NeedDef.insert(Reduction.second.getLoopExitInstr());
    }
  }

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
                                             DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
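  // (Sink-after constraints typically originate from first-order recurrences,
  // where a user of the recurrence phi has to be moved after the instruction
  // producing the recurring value; see LoopVectorizationLegality.)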
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor::RecurrenceKind Kind =
        Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
          Kind == RecurrenceDescriptor::RK_FloatMinMax) {
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
      }
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  }

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Represent values that will have defs inside VPlan.
  for (Value *V : NeedDef)
    Plan->addVPValue(V);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
7752 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 7753 continue; 7754 7755 if (auto Recipe = 7756 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 7757 RecipeBuilder.setRecipe(Instr, Recipe); 7758 VPBB->appendRecipe(Recipe); 7759 continue; 7760 } 7761 7762 // Otherwise, if all widening options failed, Instruction is to be 7763 // replicated. This may create a successor for VPBB. 7764 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7765 Instr, Range, VPBB, PredInst2Recipe, Plan); 7766 if (NextVPBB != VPBB) { 7767 VPBB = NextVPBB; 7768 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7769 : ""); 7770 } 7771 } 7772 } 7773 7774 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7775 // may also be empty, such as the last one VPBB, reflecting original 7776 // basic-blocks with no recipes. 7777 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7778 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7779 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7780 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7781 delete PreEntry; 7782 7783 // --------------------------------------------------------------------------- 7784 // Transform initial VPlan: Apply previously taken decisions, in order, to 7785 // bring the VPlan to its final state. 7786 // --------------------------------------------------------------------------- 7787 7788 // Apply Sink-After legal constraints. 7789 for (auto &Entry : SinkAfter) { 7790 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7791 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7792 Sink->moveAfter(Target); 7793 } 7794 7795 // Interleave memory: for each Interleave Group we marked earlier as relevant 7796 // for this VPlan, replace the Recipes widening its memory instructions with a 7797 // single VPInterleaveRecipe at its insertion point. 7798 for (auto IG : InterleaveGroups) { 7799 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7800 RecipeBuilder.getRecipe(IG->getInsertPos())); 7801 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7802 ->insertBefore(Recipe); 7803 7804 for (unsigned i = 0; i < IG->getFactor(); ++i) 7805 if (Instruction *Member = IG->getMember(i)) { 7806 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7807 } 7808 } 7809 7810 // Adjust the recipes for any inloop reductions. 7811 if (Range.Start > 1) 7812 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 7813 7814 // Finally, if tail is folded by masking, introduce selects between the phi 7815 // and the live-out instruction of each reduction, at the end of the latch. 
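  // Conceptually (illustrative only), each such reduction gets
  //   %rdx = select <W x i1> %header.mask, %rdx.liveout, %rdx.phi
  // so that lanes masked off in the final, partial iteration keep the value
  // carried over from the previous iteration.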
  if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
    Builder.setInsertPoint(VPBB);
    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
    for (auto &Reduction : Legal->getReductionVars()) {
      if (CM.isInLoopReduction(Reduction.first))
        continue;
      VPValue *Phi = Plan->getVPValue(Reduction.first);
      VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
      Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
    }
  }

  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  ElementCount VF = ElementCount::getFixed(Range.Start);
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; VF.getKnownMinValue() < Range.End; VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}

VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
    Plan->addVF(ElementCount::getFixed(VF));

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(
      OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
  return Plan;
}

// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
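    // For example (illustrative only), for an in-loop integer add reduction
    // the chain is phi -> add -> ... -> loop-exit add; each link is rewritten
    // below into a VPReductionRecipe whose chain operand is the previous link
    // and whose vector operand is the link's other operand.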
7892 Instruction *Chain = Phi; 7893 for (Instruction *R : ReductionOperations) { 7894 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 7895 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind(); 7896 7897 VPValue *ChainOp = Plan->getVPValue(Chain); 7898 unsigned FirstOpId; 7899 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7900 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7901 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 7902 "Expected to replace a VPWidenSelectSC"); 7903 FirstOpId = 1; 7904 } else { 7905 assert(isa<VPWidenRecipe>(WidenRecipe) && 7906 "Expected to replace a VPWidenSC"); 7907 FirstOpId = 0; 7908 } 7909 unsigned VecOpId = 7910 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 7911 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 7912 7913 auto *CondOp = CM.foldTailByMasking() 7914 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 7915 : nullptr; 7916 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 7917 &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); 7918 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 7919 WidenRecipe->eraseFromParent(); 7920 7921 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7922 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7923 VPRecipeBase *CompareRecipe = 7924 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 7925 assert(isa<VPWidenRecipe>(CompareRecipe) && 7926 "Expected to replace a VPWidenSC"); 7927 CompareRecipe->eraseFromParent(); 7928 } 7929 Chain = R; 7930 } 7931 } 7932 } 7933 7934 Value* LoopVectorizationPlanner::VPCallbackILV:: 7935 getOrCreateVectorValues(Value *V, unsigned Part) { 7936 return ILV.getOrCreateVectorValue(V, Part); 7937 } 7938 7939 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7940 Value *V, const VPIteration &Instance) { 7941 return ILV.getOrCreateScalarValue(V, Instance); 7942 } 7943 7944 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7945 VPSlotTracker &SlotTracker) const { 7946 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7947 IG->getInsertPos()->printAsOperand(O, false); 7948 O << ", "; 7949 getAddr()->printAsOperand(O, SlotTracker); 7950 VPValue *Mask = getMask(); 7951 if (Mask) { 7952 O << ", "; 7953 Mask->printAsOperand(O, SlotTracker); 7954 } 7955 for (unsigned i = 0; i < IG->getFactor(); ++i) 7956 if (Instruction *I = IG->getMember(i)) 7957 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 7958 } 7959 7960 void VPWidenCallRecipe::execute(VPTransformState &State) { 7961 State.ILV->widenCallInstruction(Ingredient, *this, State); 7962 } 7963 7964 void VPWidenSelectRecipe::execute(VPTransformState &State) { 7965 State.ILV->widenSelectInstruction(Ingredient, *this, InvariantCond, State); 7966 } 7967 7968 void VPWidenRecipe::execute(VPTransformState &State) { 7969 State.ILV->widenInstruction(Ingredient, *this, State); 7970 } 7971 7972 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7973 State.ILV->widenGEP(GEP, *this, State.UF, State.VF, IsPtrLoopInvariant, 7974 IsIndexLoopInvariant, State); 7975 } 7976 7977 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7978 assert(!State.Instance && "Int or FP induction being replicated."); 7979 State.ILV->widenIntOrFpInduction(IV, Trunc); 7980 } 7981 7982 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7983 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7984 } 7985 7986 void 
VPBlendRecipe::execute(VPTransformState &State) { 7987 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7988 // We know that all PHIs in non-header blocks are converted into 7989 // selects, so we don't have to worry about the insertion order and we 7990 // can just use the builder. 7991 // At this point we generate the predication tree. There may be 7992 // duplications since this is a simple recursive scan, but future 7993 // optimizations will clean it up. 7994 7995 unsigned NumIncoming = getNumIncomingValues(); 7996 7997 // Generate a sequence of selects of the form: 7998 // SELECT(Mask3, In3, 7999 // SELECT(Mask2, In2, 8000 // SELECT(Mask1, In1, 8001 // In0))) 8002 // Note that Mask0 is never used: lanes for which no path reaches this phi and 8003 // are essentially undef are taken from In0. 8004 InnerLoopVectorizer::VectorParts Entry(State.UF); 8005 for (unsigned In = 0; In < NumIncoming; ++In) { 8006 for (unsigned Part = 0; Part < State.UF; ++Part) { 8007 // We might have single edge PHIs (blocks) - use an identity 8008 // 'select' for the first PHI operand. 8009 Value *In0 = State.get(getIncomingValue(In), Part); 8010 if (In == 0) 8011 Entry[Part] = In0; // Initialize with the first incoming value. 8012 else { 8013 // Select between the current value and the previous incoming edge 8014 // based on the incoming mask. 8015 Value *Cond = State.get(getMask(In), Part); 8016 Entry[Part] = 8017 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 8018 } 8019 } 8020 } 8021 for (unsigned Part = 0; Part < State.UF; ++Part) 8022 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 8023 } 8024 8025 void VPInterleaveRecipe::execute(VPTransformState &State) { 8026 assert(!State.Instance && "Interleave group being replicated."); 8027 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); 8028 } 8029 8030 void VPReductionRecipe::execute(VPTransformState &State) { 8031 assert(!State.Instance && "Reduction being replicated."); 8032 for (unsigned Part = 0; Part < State.UF; ++Part) { 8033 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind(); 8034 Value *NewVecOp = State.get(VecOp, Part); 8035 if (CondOp) { 8036 Value *NewCond = State.get(CondOp, Part); 8037 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 8038 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 8039 Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType()); 8040 Constant *IdenVec = 8041 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 8042 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 8043 NewVecOp = Select; 8044 } 8045 Value *NewRed = 8046 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN); 8047 Value *PrevInChain = State.get(ChainOp, Part); 8048 Value *NextInChain; 8049 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8050 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8051 NextInChain = 8052 createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(), 8053 NewRed, PrevInChain); 8054 } else { 8055 NextInChain = State.Builder.CreateBinOp( 8056 (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain); 8057 } 8058 State.ValueMap.setVectorValue(I, Part, NextInChain); 8059 } 8060 } 8061 8062 void VPReplicateRecipe::execute(VPTransformState &State) { 8063 if (State.Instance) { // Generate a single instance. 8064 State.ILV->scalarizeInstruction(Ingredient, *this, *State.Instance, 8065 IsPredicated, State); 8066 // Insert scalar instance packing it into a vector. 
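    // ("Packing" means inserting the scalar result into the per-part vector
    // value at the current lane; lane 0 starts the insert-element sequence
    // from an undef vector below.)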
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, *this, {Part, Lane},
                                      IsPredicated, State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst = cast<Instruction>(
      State.ValueMap.getScalarValue(PredInst, *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8136 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 8137 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 8138 } else { 8139 Type *PredInstType = PredInst->getType(); 8140 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 8141 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 8142 Phi->addIncoming(ScalarPredInst, PredicatedBB); 8143 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 8144 } 8145 } 8146 8147 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 8148 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 8149 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, 8150 getMask()); 8151 } 8152 8153 // Determine how to lower the scalar epilogue, which depends on 1) optimising 8154 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 8155 // predication, and 4) a TTI hook that analyses whether the loop is suitable 8156 // for predication. 8157 static ScalarEpilogueLowering getScalarEpilogueLowering( 8158 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 8159 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 8160 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 8161 LoopVectorizationLegality &LVL) { 8162 // 1) OptSize takes precedence over all other options, i.e. if this is set, 8163 // don't look at hints or options, and don't request a scalar epilogue. 8164 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 8165 // LoopAccessInfo (due to code dependency and not being able to reliably get 8166 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 8167 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 8168 // versioning when the vectorization is forced, unlike hasOptSize. So revert 8169 // back to the old way and vectorize with versioning when forced. See D81345.) 8170 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 8171 PGSOQueryType::IRPass) && 8172 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 8173 return CM_ScalarEpilogueNotAllowedOptSize; 8174 8175 bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() && 8176 !PreferPredicateOverEpilogue; 8177 8178 // 2) Next, if disabling predication is requested on the command line, honour 8179 // this and request a scalar epilogue. 8180 if (PredicateOptDisabled) 8181 return CM_ScalarEpilogueAllowed; 8182 8183 // 3) and 4) look if enabling predication is requested on the command line, 8184 // with a loop hint, or if the TTI hook indicates this is profitable, request 8185 // predication. 8186 if (PreferPredicateOverEpilogue || 8187 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || 8188 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 8189 LVL.getLAI()) && 8190 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) 8191 return CM_ScalarEpilogueNotNeededUsePredicate; 8192 8193 return CM_ScalarEpilogueAllowed; 8194 } 8195 8196 // Process the loop in the VPlan-native vectorization path. This path builds 8197 // VPlan upfront in the vectorization pipeline, which allows to apply 8198 // VPlan-to-VPlan transformations from the very beginning without modifying the 8199 // input LLVM IR. 
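// (This path is entered only for outer loops, from processLoop() below, and
// only when the -enable-vplan-native-path flag is set, as asserted below.)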
8200 static bool processLoopInVPlanNativePath( 8201 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 8202 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 8203 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 8204 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 8205 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 8206 8207 if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) { 8208 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 8209 return false; 8210 } 8211 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 8212 Function *F = L->getHeader()->getParent(); 8213 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 8214 8215 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8216 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 8217 8218 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 8219 &Hints, IAI); 8220 // Use the planner for outer loop vectorization. 8221 // TODO: CM is not used at this point inside the planner. Turn CM into an 8222 // optional argument if we don't need it in the future. 8223 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 8224 8225 // Get user vectorization factor. 8226 const unsigned UserVF = Hints.getWidth(); 8227 8228 // Plan how to best vectorize, return the best VF and its cost. 8229 const VectorizationFactor VF = 8230 LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF)); 8231 8232 // If we are stress testing VPlan builds, do not attempt to generate vector 8233 // code. Masked vector code generation support will follow soon. 8234 // Also, do not attempt to vectorize if no vector code will be produced. 8235 if (VPlanBuildStressTest || EnableVPlanPredication || 8236 VectorizationFactor::Disabled() == VF) 8237 return false; 8238 8239 LVP.setBestPlan(VF.Width, 1); 8240 8241 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 8242 &CM, BFI, PSI); 8243 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 8244 << L->getHeader()->getParent()->getName() << "\"\n"); 8245 LVP.executePlan(LB, DT); 8246 8247 // Mark the loop as already vectorized to avoid vectorizing again. 8248 Hints.setAlreadyVectorized(); 8249 8250 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 8251 return true; 8252 } 8253 8254 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 8255 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 8256 !EnableLoopInterleaving), 8257 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 8258 !EnableLoopVectorization) {} 8259 8260 bool LoopVectorizePass::processLoop(Loop *L) { 8261 assert((EnableVPlanNativePath || L->isInnermost()) && 8262 "VPlan-native path is not enabled. Only process inner loops."); 8263 8264 #ifndef NDEBUG 8265 const std::string DebugLocStr = getDebugLocString(L); 8266 #endif /* NDEBUG */ 8267 8268 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 8269 << L->getHeader()->getParent()->getName() << "\" from " 8270 << DebugLocStr << "\n"); 8271 8272 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 8273 8274 LLVM_DEBUG( 8275 dbgs() << "LV: Loop hints:" 8276 << " force=" 8277 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 8278 ? "disabled" 8279 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 8280 ? 
"enabled" 8281 : "?")) 8282 << " width=" << Hints.getWidth() 8283 << " unroll=" << Hints.getInterleave() << "\n"); 8284 8285 // Function containing loop 8286 Function *F = L->getHeader()->getParent(); 8287 8288 // Looking at the diagnostic output is the only way to determine if a loop 8289 // was vectorized (other than looking at the IR or machine code), so it 8290 // is important to generate an optimization remark for each loop. Most of 8291 // these messages are generated as OptimizationRemarkAnalysis. Remarks 8292 // generated as OptimizationRemark and OptimizationRemarkMissed are 8293 // less verbose reporting vectorized loops and unvectorized loops that may 8294 // benefit from vectorization, respectively. 8295 8296 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 8297 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 8298 return false; 8299 } 8300 8301 PredicatedScalarEvolution PSE(*SE, *L); 8302 8303 // Check if it is legal to vectorize the loop. 8304 LoopVectorizationRequirements Requirements(*ORE); 8305 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 8306 &Requirements, &Hints, DB, AC, BFI, PSI); 8307 if (!LVL.canVectorize(EnableVPlanNativePath)) { 8308 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 8309 Hints.emitRemarkWithHints(); 8310 return false; 8311 } 8312 8313 // Check the function attributes and profiles to find out if this function 8314 // should be optimized for size. 8315 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8316 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 8317 8318 // Entrance to the VPlan-native vectorization path. Outer loops are processed 8319 // here. They may require CFG and instruction level transformations before 8320 // even evaluating whether vectorization is profitable. Since we cannot modify 8321 // the incoming IR, we need to build VPlan upfront in the vectorization 8322 // pipeline. 8323 if (!L->isInnermost()) 8324 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 8325 ORE, BFI, PSI, Hints); 8326 8327 assert(L->isInnermost() && "Inner loop expected."); 8328 8329 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 8330 // count by optimizing for size, to minimize overheads. 8331 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 8332 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 8333 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 8334 << "This loop is worth vectorizing only if no scalar " 8335 << "iteration overheads are incurred."); 8336 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 8337 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 8338 else { 8339 LLVM_DEBUG(dbgs() << "\n"); 8340 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 8341 } 8342 } 8343 8344 // Check the function attributes to see if implicit floats are allowed. 8345 // FIXME: This check doesn't seem possibly correct -- what if the loop is 8346 // an integer loop and the vector instructions selected are purely integer 8347 // vector instructions? 8348 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 8349 reportVectorizationFailure( 8350 "Can't vectorize when the NoImplicitFloat attribute is used", 8351 "loop not vectorized due to NoImplicitFloat attribute", 8352 "NoImplicitFloat", ORE, L); 8353 Hints.emitRemarkWithHints(); 8354 return false; 8355 } 8356 8357 // Check if the target supports potentially unsafe FP vectorization. 
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved)
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor and interleave count.
  unsigned UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF =
      LVP.plan(ElementCount::getFixed(UserVF), UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being
    // explicitly requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
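    // (Illustrative note, not from the original source: UserIC comes from the
    // loop's interleave hint -- for example a loop annotated with
    //   #pragma clang loop interleave_count(4)
    // reaches this code with UserIC == 4 and is handled by the surrounding
    // branches. This branch covers loops with no such hint, or a hint of 1,
    // where the cost model also found interleaving unprofitable.)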
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
                         "disabled.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided not to vectorize the loop (the chosen VF is scalar), then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is worthwhile to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM, BFI, PSI);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
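    // (Illustrative note, not from the original source: when this triggers,
    // AddRuntimeUnrollDisableMetaData below tags the remaining scalar loop
    // with loop metadata roughly of the form
    //   !llvm.loop !{..., !{!"llvm.loop.unroll.runtime.disable"}}
    // so that the loop unroller later skips runtime unrolling of that loop.)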
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt vectorization if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Each supported loop in the function has now been processed.
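  // (Illustrative note, not from the original source: Changed records whether
  // any IR was modified at all, including the loop-simplify and LCSSA
  // formation above, while CFGChanged records only modifications that alter
  // the CFG; the caller uses the latter to decide whether CFG-dependent
  // analyses can be preserved.)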
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve the loop-info and dominator-tree analyses
  // with outer loop vectorization. Until this is addressed, mark these
  // analyses as preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
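
// (Illustrative usage sketch, not part of the original file: the remarks
// emitted by processLoop() can be surfaced when the pass is run standalone,
// for example with
//   opt -passes=loop-vectorize -pass-remarks=loop-vectorize \
//       -pass-remarks-missed=loop-vectorize -S in.ll
// or from clang with -Rpass=loop-vectorize and -Rpass-missed=loop-vectorize.
// The exact flag spellings are assumptions about the surrounding tools rather
// than something this file defines.)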