//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
// Data for SIMD
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
// Vectorizing Compilers.
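//
// As a conceptual sketch (not tied to any particular target) of the widening
// transformation described above, a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is rewritten so that, for a vectorization factor of 4, each vector iteration
// processes elements [i, i+3] with wide loads, a wide add and a wide store,
// and the induction variable is incremented by 4. Iterations left over when n
// is not a multiple of 4 run in a scalar epilogue loop, or are masked away
// when the tail is folded into the vector body.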
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and the enum below lists the
// options. I.e., the vectorizer will try to fold the tail-loop (epilogue) into
// the vector body and predicate the instructions accordingly. If tail-folding
// fails, there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
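/// As an illustrative (hypothetical) example, a loop that reads A[2*i] but
/// never A[2*i+1] forms an interleave group of factor 2 with a gap: the wide
/// load that covers both elements per iteration must mask off the unused
/// lanes, and a group that sits inside an if-block additionally needs the
/// block's predicate applied.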
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
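/// For example (sketch only), in
///   for (i = 0; i < n; ++i)
///     if (cond[i])
///       out[i] = v;
/// the store to out[i] executes conditionally, so in a vectorized body it has
/// to be predicated (masked) or scalarized behind a branch; this option caps
/// how many such stores we are willing to pay for.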
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
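/// For example (assuming a typical x86-64 data layout), x86_fp80 has a store
/// size of 10 bytes but an alloc size of 16 bytes, so an array of x86_fp80 is
/// not bitcast-compatible with <VF x x86_fp80> and the type is treated as
/// irregular.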
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return TypeSize::get(VF.getKnownMinValue() *
                             DL.getTypeAllocSize(Ty).getFixedValue(),
                         VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
                ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing, we only handled real values
  /// that were defined inside the loop, and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish which IV is currently being
  /// processed - the original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  void createInductionResumeValues(Loop *L, Value *VectorTripCount);

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
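///
/// As a simplified, made-up example of the comparison this model performs: if
/// the scalar loop body is estimated at 8 cost units per iteration and the
/// VF=4 vector body at 20 units, the vector cost per scalar iteration is
/// 20 / 4 = 5, so VF=4 would be considered profitable relative to scalar
/// execution (subject to trip-count and overhead considerations).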
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usage of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W, unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(!VF.isScalable() && "scalable vectors not yet supported.");
    assert(VF.isVector() && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
1369 bool isAccessInterleaved(Instruction *Instr) {
1370 return InterleaveInfo.isInterleaved(Instr);
1371 }
1372
1373 /// Get the interleaved access group that \p Instr belongs to.
1374 const InterleaveGroup<Instruction> *
1375 getInterleavedAccessGroup(Instruction *Instr) {
1376 return InterleaveInfo.getInterleaveGroup(Instr);
1377 }
1378
1379 /// Returns true if an interleaved group requires a scalar iteration
1380 /// to handle accesses with gaps, and there is nothing preventing us from
1381 /// creating a scalar epilogue.
1382 bool requiresScalarEpilogue() const {
1383 return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1384 }
1385
1386 /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1387 /// disallowed due to optsize or a loop hint annotation.
1388 bool isScalarEpilogueAllowed() const {
1389 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1390 }
1391
1392 /// Returns true if all loop blocks should be masked to fold the loop tail.
1393 bool foldTailByMasking() const { return FoldTailByMasking; }
1394
1395 bool blockNeedsPredication(BasicBlock *BB) {
1396 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1397 }
1398
1399 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1400 /// nodes to the chain of instructions representing the reductions. Uses a
1401 /// MapVector to ensure deterministic iteration order.
1402 using ReductionChainMap =
1403 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1404
1405 /// Return the chain of instructions representing an inloop reduction.
1406 const ReductionChainMap &getInLoopReductionChains() const {
1407 return InLoopReductionChains;
1408 }
1409
1410 /// Returns true if the Phi is part of an inloop reduction.
1411 bool isInLoopReduction(PHINode *Phi) const {
1412 return InLoopReductionChains.count(Phi);
1413 }
1414
1415 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1416 /// with factor VF. Return the cost of the instruction, including
1417 /// scalarization overhead if it's needed.
1418 unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1419
1420 /// Estimate cost of a call instruction CI if it were vectorized with factor
1421 /// VF. Return the cost of the instruction, including scalarization overhead
1422 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1423 /// scalarized, i.e. either a vector version isn't available, or it is too
1424 /// expensive.
1425 unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1426 bool &NeedToScalarize);
1427
1428 /// Invalidates decisions already taken by the cost model.
1429 void invalidateCostModelingDecisions() {
1430 WideningDecisions.clear();
1431 Uniforms.clear();
1432 Scalars.clear();
1433 }
1434
1435 private:
1436 unsigned NumPredStores = 0;
1437
1438 /// \return An upper bound for the vectorization factor, a power-of-2 larger
1439 /// than zero. One is returned if vectorization should best be avoided due
1440 /// to cost.
1441 unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1442
1443 /// The vectorization cost is a combination of the cost itself and a boolean
1444 /// indicating whether any of the contributing operations will actually
1445 /// operate on vector values after type legalization in the backend. If this
1446 /// latter value is false, then all operations will be scalarized (i.e. no
1447 /// vectorization has actually taken place).
1450 using VectorizationCostTy = std::pair<unsigned, bool>;
1451
1452 /// Returns the expected execution cost. The unit of the cost does
1453 /// not matter because we use the 'cost' units to compare different
1454 /// vector widths. The cost that is returned is *not* normalized by
1455 /// the factor width.
1456 VectorizationCostTy expectedCost(ElementCount VF);
1457
1458 /// Returns the execution time cost of an instruction for a given vector
1459 /// width. Vector width of one means scalar.
1460 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1461
1462 /// The cost-computation logic from getInstructionCost which provides
1463 /// the vector type as an output parameter.
1464 unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1465
1466 /// Calculate vectorization cost of memory instruction \p I.
1467 unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1468
1469 /// The cost computation for a scalarized memory instruction.
1470 unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1471
1472 /// The cost computation for an interleaving group of memory instructions.
1473 unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1474
1475 /// The cost computation for a Gather/Scatter instruction.
1476 unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1477
1478 /// The cost computation for widening instruction \p I with consecutive
1479 /// memory access.
1480 unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1481
1482 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1483 /// Load: scalar load + broadcast.
1484 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1485 /// element)
1486 unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1487
1488 /// Estimate the overhead of scalarizing an instruction. This is a
1489 /// convenience wrapper for the type-based getScalarizationOverhead API.
1490 unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1491
1492 /// Returns whether the instruction is a load or store and will be emitted
1493 /// as a vector operation.
1494 bool isConsecutiveLoadOrStore(Instruction *I);
1495
1496 /// Returns true if an artificially high cost for emulated masked memrefs
1497 /// should be used.
1498 bool useEmulatedMaskMemRefHack(Instruction *I);
1499
1500 /// Map of scalar integer values to the smallest bitwidth they can be legally
1501 /// represented as. The vector equivalents of these values should be truncated
1502 /// to this type.
1503 MapVector<Instruction *, uint64_t> MinBWs;
1504
1505 /// A type representing the costs for instructions if they were to be
1506 /// scalarized rather than vectorized. The entries are Instruction-Cost
1507 /// pairs.
1508 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1509
1510 /// A set containing all BasicBlocks that are known to be present after
1511 /// vectorization as a predicated block.
1512 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1513
1514 /// Records whether it is allowed to have the original scalar loop execute at
1515 /// least once. This may be needed as a fallback loop in case runtime
1516 /// aliasing/dependence checks fail, or to handle the tail/remainder
1517 /// iterations when the trip count is unknown or doesn't divide by the VF,
1518 /// or as a peel-loop to handle gaps in interleave-groups.
1519 /// Under optsize and when the trip count is very small we don't allow any 1520 /// iterations to execute in the scalar loop. 1521 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1522 1523 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1524 bool FoldTailByMasking = false; 1525 1526 /// A map holding scalar costs for different vectorization factors. The 1527 /// presence of a cost for an instruction in the mapping indicates that the 1528 /// instruction will be scalarized when vectorizing with the associated 1529 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1530 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1531 1532 /// Holds the instructions known to be uniform after vectorization. 1533 /// The data is collected per VF. 1534 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1535 1536 /// Holds the instructions known to be scalar after vectorization. 1537 /// The data is collected per VF. 1538 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1539 1540 /// Holds the instructions (address computations) that are forced to be 1541 /// scalarized. 1542 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1543 1544 /// PHINodes of the reductions that should be expanded in-loop along with 1545 /// their associated chains of reduction operations, in program order from top 1546 /// (PHI) to bottom 1547 ReductionChainMap InLoopReductionChains; 1548 1549 /// Returns the expected difference in cost from scalarizing the expression 1550 /// feeding a predicated instruction \p PredInst. The instructions to 1551 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1552 /// non-negative return value implies the expression will be scalarized. 1553 /// Currently, only single-use chains are considered for scalarization. 1554 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1555 ElementCount VF); 1556 1557 /// Collect the instructions that are uniform after vectorization. An 1558 /// instruction is uniform if we represent it with a single scalar value in 1559 /// the vectorized loop corresponding to each vector iteration. Examples of 1560 /// uniform instructions include pointer operands of consecutive or 1561 /// interleaved memory accesses. Note that although uniformity implies an 1562 /// instruction will be scalar, the reverse is not true. In general, a 1563 /// scalarized instruction will be represented by VF scalar values in the 1564 /// vectorized loop, each corresponding to an iteration of the original 1565 /// scalar loop. 1566 void collectLoopUniforms(ElementCount VF); 1567 1568 /// Collect the instructions that are scalar after vectorization. An 1569 /// instruction is scalar if it is known to be uniform or will be scalarized 1570 /// during vectorization. Non-uniform scalarized instructions will be 1571 /// represented by VF values in the vectorized loop, each corresponding to an 1572 /// iteration of the original scalar loop. 1573 void collectLoopScalars(ElementCount VF); 1574 1575 /// Keeps cost model vectorization decision and cost for instructions. 1576 /// Right now it is used for memory instructions only. 1577 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1578 std::pair<InstWidening, unsigned>>; 1579 1580 DecisionList WideningDecisions; 1581 1582 /// Returns true if \p V is expected to be vectorized and it needs to be 1583 /// extracted. 
1584 bool needsExtract(Value *V, ElementCount VF) const { 1585 Instruction *I = dyn_cast<Instruction>(V); 1586 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1587 TheLoop->isLoopInvariant(I)) 1588 return false; 1589 1590 // Assume we can vectorize V (and hence we need extraction) if the 1591 // scalars are not computed yet. This can happen, because it is called 1592 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1593 // the scalars are collected. That should be a safe assumption in most 1594 // cases, because we check if the operands have vectorizable types 1595 // beforehand in LoopVectorizationLegality. 1596 return Scalars.find(VF) == Scalars.end() || 1597 !isScalarAfterVectorization(I, VF); 1598 }; 1599 1600 /// Returns a range containing only operands needing to be extracted. 1601 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1602 ElementCount VF) { 1603 return SmallVector<Value *, 4>(make_filter_range( 1604 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1605 } 1606 1607 public: 1608 /// The loop that we evaluate. 1609 Loop *TheLoop; 1610 1611 /// Predicated scalar evolution analysis. 1612 PredicatedScalarEvolution &PSE; 1613 1614 /// Loop Info analysis. 1615 LoopInfo *LI; 1616 1617 /// Vectorization legality. 1618 LoopVectorizationLegality *Legal; 1619 1620 /// Vector target information. 1621 const TargetTransformInfo &TTI; 1622 1623 /// Target Library Info. 1624 const TargetLibraryInfo *TLI; 1625 1626 /// Demanded bits analysis. 1627 DemandedBits *DB; 1628 1629 /// Assumption cache. 1630 AssumptionCache *AC; 1631 1632 /// Interface to emit optimization remarks. 1633 OptimizationRemarkEmitter *ORE; 1634 1635 const Function *TheFunction; 1636 1637 /// Loop Vectorize Hint. 1638 const LoopVectorizeHints *Hints; 1639 1640 /// The interleave access information contains groups of interleaved accesses 1641 /// with the same stride and close to each other. 1642 InterleavedAccessInfo &InterleaveInfo; 1643 1644 /// Values to ignore in the cost model. 1645 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1646 1647 /// Values to ignore in the cost model when VF > 1. 1648 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1649 }; 1650 1651 } // end namespace llvm 1652 1653 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1654 // vectorization. The loop needs to be annotated with #pragma omp simd 1655 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1656 // vector length information is not provided, vectorization is not considered 1657 // explicit. Interleave hints are not allowed either. These limitations will be 1658 // relaxed in the future. 1659 // Please, note that we are currently forced to abuse the pragma 'clang 1660 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1661 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1662 // provides *explicit vectorization hints* (LV can bypass legal checks and 1663 // assume that vectorization is legal). However, both hints are implemented 1664 // using the same metadata (llvm.loop.vectorize, processed by 1665 // LoopVectorizeHints). This will be fixed in the future when the native IR 1666 // representation for pragma 'omp simd' is introduced. 
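// E.g. an outer loop of the following form, annotated with an explicit
// vector length, is recognized by isExplicitVecOuterLoop() below:
//
//   #pragma omp simd simdlen(4)
//   for (i = 0; i < N; ++i)      // explicitly vectorized outer loop
//     for (j = 0; j < M; ++j)    // inner loop
//       A[i][j] += B[i][j];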
1667 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1668 OptimizationRemarkEmitter *ORE) { 1669 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 1670 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1671 1672 // Only outer loops with an explicit vectorization hint are supported. 1673 // Unannotated outer loops are ignored. 1674 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1675 return false; 1676 1677 Function *Fn = OuterLp->getHeader()->getParent(); 1678 if (!Hints.allowVectorization(Fn, OuterLp, 1679 true /*VectorizeOnlyWhenForced*/)) { 1680 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1681 return false; 1682 } 1683 1684 if (Hints.getInterleave() > 1) { 1685 // TODO: Interleave support is future work. 1686 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1687 "outer loops.\n"); 1688 Hints.emitRemarkWithHints(); 1689 return false; 1690 } 1691 1692 return true; 1693 } 1694 1695 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1696 OptimizationRemarkEmitter *ORE, 1697 SmallVectorImpl<Loop *> &V) { 1698 // Collect inner loops and outer loops without irreducible control flow. For 1699 // now, only collect outer loops that have explicit vectorization hints. If we 1700 // are stress testing the VPlan H-CFG construction, we collect the outermost 1701 // loop of every loop nest. 1702 if (L.isInnermost() || VPlanBuildStressTest || 1703 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1704 LoopBlocksRPO RPOT(&L); 1705 RPOT.perform(LI); 1706 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1707 V.push_back(&L); 1708 // TODO: Collect inner loops inside marked outer loops in case 1709 // vectorization fails for the outer loop. Do not invoke 1710 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1711 // already known to be reducible. We can use an inherited attribute for 1712 // that. 1713 return; 1714 } 1715 } 1716 for (Loop *InnerL : L) 1717 collectSupportedLoops(*InnerL, LI, ORE, V); 1718 } 1719 1720 namespace { 1721 1722 /// The LoopVectorize Pass. 1723 struct LoopVectorize : public FunctionPass { 1724 /// Pass identification, replacement for typeid 1725 static char ID; 1726 1727 LoopVectorizePass Impl; 1728 1729 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1730 bool VectorizeOnlyWhenForced = false) 1731 : FunctionPass(ID), 1732 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1733 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1734 } 1735 1736 bool runOnFunction(Function &F) override { 1737 if (skipFunction(F)) 1738 return false; 1739 1740 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1741 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1742 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1743 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1744 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1745 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1746 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 1747 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1748 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1749 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1750 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1751 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1752 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1753 1754 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1755 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1756 1757 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1758 GetLAA, *ORE, PSI).MadeAnyChange; 1759 } 1760 1761 void getAnalysisUsage(AnalysisUsage &AU) const override { 1762 AU.addRequired<AssumptionCacheTracker>(); 1763 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1764 AU.addRequired<DominatorTreeWrapperPass>(); 1765 AU.addRequired<LoopInfoWrapperPass>(); 1766 AU.addRequired<ScalarEvolutionWrapperPass>(); 1767 AU.addRequired<TargetTransformInfoWrapperPass>(); 1768 AU.addRequired<AAResultsWrapperPass>(); 1769 AU.addRequired<LoopAccessLegacyAnalysis>(); 1770 AU.addRequired<DemandedBitsWrapperPass>(); 1771 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1772 AU.addRequired<InjectTLIMappingsLegacy>(); 1773 1774 // We currently do not preserve loopinfo/dominator analyses with outer loop 1775 // vectorization. Until this is addressed, mark these analyses as preserved 1776 // only for non-VPlan-native path. 1777 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1778 if (!EnableVPlanNativePath) { 1779 AU.addPreserved<LoopInfoWrapperPass>(); 1780 AU.addPreserved<DominatorTreeWrapperPass>(); 1781 } 1782 1783 AU.addPreserved<BasicAAWrapperPass>(); 1784 AU.addPreserved<GlobalsAAWrapperPass>(); 1785 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1786 } 1787 }; 1788 1789 } // end anonymous namespace 1790 1791 //===----------------------------------------------------------------------===// 1792 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1793 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1794 //===----------------------------------------------------------------------===// 1795 1796 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1797 // We need to place the broadcast of invariant variables outside the loop, 1798 // but only if it's proven safe to do so. Else, broadcast will be inside 1799 // vector loop body. 1800 Instruction *Instr = dyn_cast<Instruction>(V); 1801 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1802 (!Instr || 1803 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1804 // Place the code for broadcasting invariant variables in the new preheader. 1805 IRBuilder<>::InsertPointGuard Guard(Builder); 1806 if (SafeToHoist) 1807 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1808 1809 // Broadcast the scalar into all locations in the vector. 
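// E.g. for VF = 4, a scalar %x is turned into the vector <%x, %x, %x, %x>.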
1810 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1811 1812 return Shuf; 1813 } 1814 1815 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1816 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1817 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1818 "Expected either an induction phi-node or a truncate of it!"); 1819 Value *Start = II.getStartValue(); 1820 1821 // Construct the initial value of the vector IV in the vector loop preheader 1822 auto CurrIP = Builder.saveIP(); 1823 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1824 if (isa<TruncInst>(EntryVal)) { 1825 assert(Start->getType()->isIntegerTy() && 1826 "Truncation requires an integer type"); 1827 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1828 Step = Builder.CreateTrunc(Step, TruncType); 1829 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1830 } 1831 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1832 Value *SteppedStart = 1833 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1834 1835 // We create vector phi nodes for both integer and floating-point induction 1836 // variables. Here, we determine the kind of arithmetic we will perform. 1837 Instruction::BinaryOps AddOp; 1838 Instruction::BinaryOps MulOp; 1839 if (Step->getType()->isIntegerTy()) { 1840 AddOp = Instruction::Add; 1841 MulOp = Instruction::Mul; 1842 } else { 1843 AddOp = II.getInductionOpcode(); 1844 MulOp = Instruction::FMul; 1845 } 1846 1847 // Multiply the vectorization factor by the step using integer or 1848 // floating-point arithmetic as appropriate. 1849 Value *ConstVF = 1850 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 1851 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1852 1853 // Create a vector splat to use in the induction update. 1854 // 1855 // FIXME: If the step is non-constant, we create the vector splat with 1856 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1857 // handle a constant vector splat. 1858 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1859 Value *SplatVF = isa<Constant>(Mul) 1860 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 1861 : Builder.CreateVectorSplat(VF, Mul); 1862 Builder.restoreIP(CurrIP); 1863 1864 // We may need to add the step a number of times, depending on the unroll 1865 // factor. The last of those goes into the PHI. 1866 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1867 &*LoopVectorBody->getFirstInsertionPt()); 1868 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1869 Instruction *LastInduction = VecInd; 1870 for (unsigned Part = 0; Part < UF; ++Part) { 1871 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1872 1873 if (isa<TruncInst>(EntryVal)) 1874 addMetadata(LastInduction, EntryVal); 1875 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1876 1877 LastInduction = cast<Instruction>(addFastMathFlag( 1878 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1879 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1880 } 1881 1882 // Move the last step to the end of the latch block. This ensures consistent 1883 // placement of all induction updates. 
1884 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1885 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1886 auto *ICmp = cast<Instruction>(Br->getCondition()); 1887 LastInduction->moveBefore(ICmp); 1888 LastInduction->setName("vec.ind.next"); 1889 1890 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1891 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1892 } 1893 1894 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1895 return Cost->isScalarAfterVectorization(I, VF) || 1896 Cost->isProfitableToScalarize(I, VF); 1897 } 1898 1899 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1900 if (shouldScalarizeInstruction(IV)) 1901 return true; 1902 auto isScalarInst = [&](User *U) -> bool { 1903 auto *I = cast<Instruction>(U); 1904 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1905 }; 1906 return llvm::any_of(IV->users(), isScalarInst); 1907 } 1908 1909 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1910 const InductionDescriptor &ID, const Instruction *EntryVal, 1911 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1912 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1913 "Expected either an induction phi-node or a truncate of it!"); 1914 1915 // This induction variable is not the phi from the original loop but the 1916 // newly-created IV based on the proof that casted Phi is equal to the 1917 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1918 // re-uses the same InductionDescriptor that original IV uses but we don't 1919 // have to do any recording in this case - that is done when original IV is 1920 // processed. 1921 if (isa<TruncInst>(EntryVal)) 1922 return; 1923 1924 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1925 if (Casts.empty()) 1926 return; 1927 // Only the first Cast instruction in the Casts vector is of interest. 1928 // The rest of the Casts (if exist) have no uses outside the 1929 // induction update chain itself. 1930 Instruction *CastInst = *Casts.begin(); 1931 if (Lane < UINT_MAX) 1932 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1933 else 1934 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1935 } 1936 1937 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1938 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1939 "Primary induction variable must have an integer type"); 1940 1941 auto II = Legal->getInductionVars().find(IV); 1942 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1943 1944 auto ID = II->second; 1945 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1946 1947 // The value from the original loop to which we are mapping the new induction 1948 // variable. 1949 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1950 1951 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1952 1953 // Generate code for the induction step. 
Note that induction steps are 1954 // required to be loop-invariant 1955 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1956 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1957 "Induction step should be loop invariant"); 1958 if (PSE.getSE()->isSCEVable(IV->getType())) { 1959 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1960 return Exp.expandCodeFor(Step, Step->getType(), 1961 LoopVectorPreHeader->getTerminator()); 1962 } 1963 return cast<SCEVUnknown>(Step)->getValue(); 1964 }; 1965 1966 // The scalar value to broadcast. This is derived from the canonical 1967 // induction variable. If a truncation type is given, truncate the canonical 1968 // induction variable and step. Otherwise, derive these values from the 1969 // induction descriptor. 1970 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1971 Value *ScalarIV = Induction; 1972 if (IV != OldInduction) { 1973 ScalarIV = IV->getType()->isIntegerTy() 1974 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1975 : Builder.CreateCast(Instruction::SIToFP, Induction, 1976 IV->getType()); 1977 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1978 ScalarIV->setName("offset.idx"); 1979 } 1980 if (Trunc) { 1981 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1982 assert(Step->getType()->isIntegerTy() && 1983 "Truncation requires an integer step"); 1984 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1985 Step = Builder.CreateTrunc(Step, TruncType); 1986 } 1987 return ScalarIV; 1988 }; 1989 1990 // Create the vector values from the scalar IV, in the absence of creating a 1991 // vector IV. 1992 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1993 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1994 for (unsigned Part = 0; Part < UF; ++Part) { 1995 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1996 Value *EntryPart = 1997 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 1998 ID.getInductionOpcode()); 1999 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 2000 if (Trunc) 2001 addMetadata(EntryPart, Trunc); 2002 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 2003 } 2004 }; 2005 2006 // Now do the actual transformations, and start with creating the step value. 2007 Value *Step = CreateStepValue(ID.getStep()); 2008 if (VF.isZero() || VF.isScalar()) { 2009 Value *ScalarIV = CreateScalarIV(Step); 2010 CreateSplatIV(ScalarIV, Step); 2011 return; 2012 } 2013 2014 // Determine if we want a scalar version of the induction variable. This is 2015 // true if the induction variable itself is not widened, or if it has at 2016 // least one user in the loop that is not widened. 2017 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2018 if (!NeedsScalarIV) { 2019 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2020 return; 2021 } 2022 2023 // Try to create a new independent vector induction variable. If we can't 2024 // create the phi node, we will splat the scalar induction variable in each 2025 // loop iteration. 2026 if (!shouldScalarizeInstruction(EntryVal)) { 2027 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2028 Value *ScalarIV = CreateScalarIV(Step); 2029 // Create scalar steps that can be used by instructions we will later 2030 // scalarize. Note that the addition of the scalar steps will not increase 2031 // the number of instructions in the loop in the common case prior to 2032 // InstCombine. We will be trading one vector extract for each scalar step. 
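// E.g. with VF = 4 and UF = 2, buildScalarSteps materializes up to eight
// values of the form ScalarIV + (4 * Part + Lane) * Step; for values that
// are uniform after vectorization only lane 0 of each part is created.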
2033 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2034 return; 2035 } 2036 2037 // All IV users are scalar instructions, so only emit a scalar IV, not a 2038 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2039 // predicate used by the masked loads/stores. 2040 Value *ScalarIV = CreateScalarIV(Step); 2041 if (!Cost->isScalarEpilogueAllowed()) 2042 CreateSplatIV(ScalarIV, Step); 2043 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2044 } 2045 2046 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2047 Instruction::BinaryOps BinOp) { 2048 // Create and check the types. 2049 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2050 int VLen = ValVTy->getNumElements(); 2051 2052 Type *STy = Val->getType()->getScalarType(); 2053 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2054 "Induction Step must be an integer or FP"); 2055 assert(Step->getType() == STy && "Step has wrong type"); 2056 2057 SmallVector<Constant *, 8> Indices; 2058 2059 if (STy->isIntegerTy()) { 2060 // Create a vector of consecutive numbers from zero to VF. 2061 for (int i = 0; i < VLen; ++i) 2062 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2063 2064 // Add the consecutive indices to the vector value. 2065 Constant *Cv = ConstantVector::get(Indices); 2066 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2067 Step = Builder.CreateVectorSplat(VLen, Step); 2068 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2069 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2070 // which can be found from the original scalar operations. 2071 Step = Builder.CreateMul(Cv, Step); 2072 return Builder.CreateAdd(Val, Step, "induction"); 2073 } 2074 2075 // Floating point induction. 2076 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2077 "Binary Opcode should be specified for FP induction"); 2078 // Create a vector of consecutive numbers from zero to VF. 2079 for (int i = 0; i < VLen; ++i) 2080 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2081 2082 // Add the consecutive indices to the vector value. 2083 Constant *Cv = ConstantVector::get(Indices); 2084 2085 Step = Builder.CreateVectorSplat(VLen, Step); 2086 2087 // Floating point operations had to be 'fast' to enable the induction. 2088 FastMathFlags Flags; 2089 Flags.setFast(); 2090 2091 Value *MulOp = Builder.CreateFMul(Cv, Step); 2092 if (isa<Instruction>(MulOp)) 2093 // Have to check, MulOp may be a constant 2094 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2095 2096 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2097 if (isa<Instruction>(BOp)) 2098 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2099 return BOp; 2100 } 2101 2102 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2103 Instruction *EntryVal, 2104 const InductionDescriptor &ID) { 2105 // We shouldn't have to build scalar steps if we aren't vectorizing. 2106 assert(VF.isVector() && "VF should be greater than one"); 2107 assert(!VF.isScalable() && 2108 "the code below assumes a fixed number of elements at compile time"); 2109 // Get the value type and ensure it and the step have the same integer type. 2110 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2111 assert(ScalarIVTy == Step->getType() && 2112 "Val and Step should have the same type"); 2113 2114 // We build scalar steps for both integer and floating-point induction 2115 // variables. 
Here, we determine the kind of arithmetic we will perform. 2116 Instruction::BinaryOps AddOp; 2117 Instruction::BinaryOps MulOp; 2118 if (ScalarIVTy->isIntegerTy()) { 2119 AddOp = Instruction::Add; 2120 MulOp = Instruction::Mul; 2121 } else { 2122 AddOp = ID.getInductionOpcode(); 2123 MulOp = Instruction::FMul; 2124 } 2125 2126 // Determine the number of scalars we need to generate for each unroll 2127 // iteration. If EntryVal is uniform, we only need to generate the first 2128 // lane. Otherwise, we generate all VF values. 2129 unsigned Lanes = 2130 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2131 ? 1 2132 : VF.getKnownMinValue(); 2133 // Compute the scalar steps and save the results in VectorLoopValueMap. 2134 for (unsigned Part = 0; Part < UF; ++Part) { 2135 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2136 auto *StartIdx = getSignedIntOrFpConstant( 2137 ScalarIVTy, VF.getKnownMinValue() * Part + Lane); 2138 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2139 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2140 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2141 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2142 } 2143 } 2144 } 2145 2146 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2147 assert(V != Induction && "The new induction variable should not be used."); 2148 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2149 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2150 2151 // If we have a stride that is replaced by one, do it here. Defer this for 2152 // the VPlan-native path until we start running Legal checks in that path. 2153 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2154 V = ConstantInt::get(V->getType(), 1); 2155 2156 // If we have a vector mapped to this value, return it. 2157 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2158 return VectorLoopValueMap.getVectorValue(V, Part); 2159 2160 // If the value has not been vectorized, check if it has been scalarized 2161 // instead. If it has been scalarized, and we actually need the value in 2162 // vector form, we will construct the vector values on demand. 2163 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2164 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2165 2166 // If we've scalarized a value, that value should be an instruction. 2167 auto *I = cast<Instruction>(V); 2168 2169 // If we aren't vectorizing, we can just copy the scalar map values over to 2170 // the vector map. 2171 if (VF.isScalar()) { 2172 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2173 return ScalarValue; 2174 } 2175 2176 // Get the last scalar instruction we generated for V and Part. If the value 2177 // is known to be uniform after vectorization, this corresponds to lane zero 2178 // of the Part unroll iteration. Otherwise, the last instruction is the one 2179 // we created for the last vector lane of the Part unroll iteration. 2180 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2181 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2182 ? 0 2183 : VF.getKnownMinValue() - 1; 2184 auto *LastInst = cast<Instruction>( 2185 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2186 2187 // Set the insert point after the last scalarized instruction. This ensures 2188 // the insertelement sequence will directly follow the scalar definitions. 
2189 auto OldIP = Builder.saveIP(); 2190 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2191 Builder.SetInsertPoint(&*NewIP); 2192 2193 // However, if we are vectorizing, we need to construct the vector values. 2194 // If the value is known to be uniform after vectorization, we can just 2195 // broadcast the scalar value corresponding to lane zero for each unroll 2196 // iteration. Otherwise, we construct the vector values using insertelement 2197 // instructions. Since the resulting vectors are stored in 2198 // VectorLoopValueMap, we will only generate the insertelements once. 2199 Value *VectorValue = nullptr; 2200 if (Cost->isUniformAfterVectorization(I, VF)) { 2201 VectorValue = getBroadcastInstrs(ScalarValue); 2202 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2203 } else { 2204 // Initialize packing with insertelements to start from undef. 2205 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2206 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2207 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2208 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2209 packScalarIntoVectorValue(V, {Part, Lane}); 2210 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2211 } 2212 Builder.restoreIP(OldIP); 2213 return VectorValue; 2214 } 2215 2216 // If this scalar is unknown, assume that it is a constant or that it is 2217 // loop invariant. Broadcast V and save the value for future uses. 2218 Value *B = getBroadcastInstrs(V); 2219 VectorLoopValueMap.setVectorValue(V, Part, B); 2220 return B; 2221 } 2222 2223 Value * 2224 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2225 const VPIteration &Instance) { 2226 // If the value is not an instruction contained in the loop, it should 2227 // already be scalar. 2228 if (OrigLoop->isLoopInvariant(V)) 2229 return V; 2230 2231 assert(Instance.Lane > 0 2232 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2233 : true && "Uniform values only have lane zero"); 2234 2235 // If the value from the original loop has not been vectorized, it is 2236 // represented by UF x VF scalar values in the new loop. Return the requested 2237 // scalar value. 2238 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2239 return VectorLoopValueMap.getScalarValue(V, Instance); 2240 2241 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2242 // for the given unroll part. If this entry is not a vector type (i.e., the 2243 // vectorization factor is one), there is no need to generate an 2244 // extractelement instruction. 2245 auto *U = getOrCreateVectorValue(V, Instance.Part); 2246 if (!U->getType()->isVectorTy()) { 2247 assert(VF.isScalar() && "Value not scalarized has non-vector type"); 2248 return U; 2249 } 2250 2251 // Otherwise, the value from the original loop has been vectorized and is 2252 // represented by UF vector values. Extract and return the requested scalar 2253 // value from the appropriate vector lane. 
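// E.g. with UF = 2 and VF = 4, the scalar for {Part = 1, Lane = 2} is
// element 2 of the vector value generated for unroll part 1.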
2254 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2255 } 2256 2257 void InnerLoopVectorizer::packScalarIntoVectorValue( 2258 Value *V, const VPIteration &Instance) { 2259 assert(V != Induction && "The new induction variable should not be used."); 2260 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2261 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2262 2263 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2264 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2265 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2266 Builder.getInt32(Instance.Lane)); 2267 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2268 } 2269 2270 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2271 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2272 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2273 SmallVector<int, 8> ShuffleMask; 2274 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2275 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2276 2277 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); 2278 } 2279 2280 // Return whether we allow using masked interleave-groups (for dealing with 2281 // strided loads/stores that reside in predicated blocks, or for dealing 2282 // with gaps). 2283 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2284 // If an override option has been passed in for interleaved accesses, use it. 2285 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2286 return EnableMaskedInterleavedMemAccesses; 2287 2288 return TTI.enableMaskedInterleavedAccessVectorization(); 2289 } 2290 2291 // Try to vectorize the interleave group that \p Instr belongs to. 2292 // 2293 // E.g. Translate following interleaved load group (factor = 3): 2294 // for (i = 0; i < N; i+=3) { 2295 // R = Pic[i]; // Member of index 0 2296 // G = Pic[i+1]; // Member of index 1 2297 // B = Pic[i+2]; // Member of index 2 2298 // ... // do something to R, G, B 2299 // } 2300 // To: 2301 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2302 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2303 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2304 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2305 // 2306 // Or translate following interleaved store group (factor = 3): 2307 // for (i = 0; i < N; i+=3) { 2308 // ... do something to R, G, B 2309 // Pic[i] = R; // Member of index 0 2310 // Pic[i+1] = G; // Member of index 1 2311 // Pic[i+2] = B; // Member of index 2 2312 // } 2313 // To: 2314 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2315 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2316 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2317 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2318 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2319 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2320 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2321 VPValue *Addr, VPValue *BlockInMask) { 2322 Instruction *Instr = Group->getInsertPos(); 2323 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2324 2325 // Prepare for the vector type of the interleaved load/store. 
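// E.g. for the factor-3 i32 group in the example above and VF = 4, the wide
// vector type is <12 x i32>.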
2326 Type *ScalarTy = getMemInstValueType(Instr); 2327 unsigned InterleaveFactor = Group->getFactor(); 2328 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2329 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2330 2331 // Prepare for the new pointers. 2332 SmallVector<Value *, 2> AddrParts; 2333 unsigned Index = Group->getIndex(Instr); 2334 2335 // TODO: extend the masked interleaved-group support to reversed access. 2336 assert((!BlockInMask || !Group->isReverse()) && 2337 "Reversed masked interleave-group not supported."); 2338 2339 // If the group is reverse, adjust the index to refer to the last vector lane 2340 // instead of the first. We adjust the index from the first vector lane, 2341 // rather than directly getting the pointer for lane VF - 1, because the 2342 // pointer operand of the interleaved access is supposed to be uniform. For 2343 // uniform instructions, we're only required to generate a value for the 2344 // first vector lane in each unroll iteration. 2345 assert(!VF.isScalable() && 2346 "scalable vector reverse operation is not implemented"); 2347 if (Group->isReverse()) 2348 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2349 2350 for (unsigned Part = 0; Part < UF; Part++) { 2351 Value *AddrPart = State.get(Addr, {Part, 0}); 2352 setDebugLocFromInst(Builder, AddrPart); 2353 2354 // Notice current instruction could be any index. Need to adjust the address 2355 // to the member of index 0. 2356 // 2357 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2358 // b = A[i]; // Member of index 0 2359 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2360 // 2361 // E.g. A[i+1] = a; // Member of index 1 2362 // A[i] = b; // Member of index 0 2363 // A[i+2] = c; // Member of index 2 (Current instruction) 2364 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2365 2366 bool InBounds = false; 2367 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2368 InBounds = gep->isInBounds(); 2369 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2370 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2371 2372 // Cast to the vector pointer type. 2373 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2374 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2375 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2376 } 2377 2378 setDebugLocFromInst(Builder, Instr); 2379 Value *UndefVec = UndefValue::get(VecTy); 2380 2381 Value *MaskForGaps = nullptr; 2382 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2383 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2384 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2385 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2386 } 2387 2388 // Vectorize the interleaved load group. 2389 if (isa<LoadInst>(Instr)) { 2390 // For each unroll part, create a wide load for the group. 
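// If the block is predicated or the group has gaps, a masked load is used:
// the block mask (if any) is replicated across the interleave factor and
// combined with the mask for the gaps (if any).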
2391 SmallVector<Value *, 2> NewLoads; 2392 for (unsigned Part = 0; Part < UF; Part++) { 2393 Instruction *NewLoad; 2394 if (BlockInMask || MaskForGaps) { 2395 assert(useMaskedInterleavedAccesses(*TTI) && 2396 "masked interleaved groups are not allowed."); 2397 Value *GroupMask = MaskForGaps; 2398 if (BlockInMask) { 2399 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2400 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2401 Value *ShuffledMask = Builder.CreateShuffleVector( 2402 BlockInMaskPart, 2403 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2404 "interleaved.mask"); 2405 GroupMask = MaskForGaps 2406 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2407 MaskForGaps) 2408 : ShuffledMask; 2409 } 2410 NewLoad = 2411 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2412 GroupMask, UndefVec, "wide.masked.vec"); 2413 } 2414 else 2415 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2416 Group->getAlign(), "wide.vec"); 2417 Group->addMetadata(NewLoad); 2418 NewLoads.push_back(NewLoad); 2419 } 2420 2421 // For each member in the group, shuffle out the appropriate data from the 2422 // wide loads. 2423 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2424 Instruction *Member = Group->getMember(I); 2425 2426 // Skip the gaps in the group. 2427 if (!Member) 2428 continue; 2429 2430 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2431 auto StrideMask = 2432 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2433 for (unsigned Part = 0; Part < UF; Part++) { 2434 Value *StridedVec = Builder.CreateShuffleVector( 2435 NewLoads[Part], StrideMask, "strided.vec"); 2436 2437 // If this member has different type, cast the result type. 2438 if (Member->getType() != ScalarTy) { 2439 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2440 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2441 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2442 } 2443 2444 if (Group->isReverse()) 2445 StridedVec = reverseVector(StridedVec); 2446 2447 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2448 } 2449 } 2450 return; 2451 } 2452 2453 // The sub vector type for current instruction. 2454 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2455 auto *SubVT = VectorType::get(ScalarTy, VF); 2456 2457 // Vectorize the interleaved store group. 2458 for (unsigned Part = 0; Part < UF; Part++) { 2459 // Collect the stored vector from each member. 2460 SmallVector<Value *, 4> StoredVecs; 2461 for (unsigned i = 0; i < InterleaveFactor; i++) { 2462 // Interleaved store group doesn't allow a gap, so each index has a member 2463 Instruction *Member = Group->getMember(i); 2464 assert(Member && "Fail to get a member from an interleaved store group"); 2465 2466 Value *StoredVec = getOrCreateVectorValue( 2467 cast<StoreInst>(Member)->getValueOperand(), Part); 2468 if (Group->isReverse()) 2469 StoredVec = reverseVector(StoredVec); 2470 2471 // If this member has different type, cast it to a unified type. 2472 2473 if (StoredVec->getType() != SubVT) 2474 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2475 2476 StoredVecs.push_back(StoredVec); 2477 } 2478 2479 // Concatenate all vectors into a wide vector. 2480 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2481 2482 // Interleave the elements in the wide vector. 
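// E.g. for VF = 4 and an interleave factor of 3, the interleave mask is
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>, as in the store example above.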
2483 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2484 Value *IVec = Builder.CreateShuffleVector( 2485 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2486 "interleaved.vec"); 2487 2488 Instruction *NewStoreInstr; 2489 if (BlockInMask) { 2490 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2491 Value *ShuffledMask = Builder.CreateShuffleVector( 2492 BlockInMaskPart, 2493 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2494 "interleaved.mask"); 2495 NewStoreInstr = Builder.CreateMaskedStore( 2496 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2497 } 2498 else 2499 NewStoreInstr = 2500 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2501 2502 Group->addMetadata(NewStoreInstr); 2503 } 2504 } 2505 2506 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2507 VPTransformState &State, 2508 VPValue *Addr, 2509 VPValue *StoredValue, 2510 VPValue *BlockInMask) { 2511 // Attempt to issue a wide load. 2512 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2513 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2514 2515 assert((LI || SI) && "Invalid Load/Store instruction"); 2516 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2517 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2518 2519 LoopVectorizationCostModel::InstWidening Decision = 2520 Cost->getWideningDecision(Instr, VF); 2521 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2522 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2523 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2524 "CM decision is not to widen the memory instruction"); 2525 2526 Type *ScalarDataTy = getMemInstValueType(Instr); 2527 2528 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2529 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2530 const Align Alignment = getLoadStoreAlignment(Instr); 2531 2532 // Determine if the pointer operand of the access is either consecutive or 2533 // reverse consecutive. 2534 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2535 bool ConsecutiveStride = 2536 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2537 bool CreateGatherScatter = 2538 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2539 2540 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2541 // gather/scatter. Otherwise Decision should have been to Scalarize. 2542 assert((ConsecutiveStride || CreateGatherScatter) && 2543 "The instruction should be scalarized"); 2544 (void)ConsecutiveStride; 2545 2546 VectorParts BlockInMaskParts(UF); 2547 bool isMaskRequired = BlockInMask; 2548 if (isMaskRequired) 2549 for (unsigned Part = 0; Part < UF; ++Part) 2550 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2551 2552 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2553 // Calculate the pointer for the specific unroll-part. 2554 GetElementPtrInst *PartPtr = nullptr; 2555 2556 bool InBounds = false; 2557 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2558 InBounds = gep->isInBounds(); 2559 2560 if (Reverse) { 2561 // If the address is consecutive but reversed, then the 2562 // wide store needs to start at the last vector element. 
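// E.g. for VF = 4 and Part = 1, the pointer is advanced by -4 and then by
// (1 - 4) = -3 elements, so the wide access covers Ptr[-7] .. Ptr[-4].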
2563 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2564 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2565 PartPtr->setIsInBounds(InBounds); 2566 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2567 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2568 PartPtr->setIsInBounds(InBounds); 2569 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2570 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2571 } else { 2572 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2573 ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue()))); 2574 PartPtr->setIsInBounds(InBounds); 2575 } 2576 2577 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2578 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2579 }; 2580 2581 // Handle Stores: 2582 if (SI) { 2583 setDebugLocFromInst(Builder, SI); 2584 2585 for (unsigned Part = 0; Part < UF; ++Part) { 2586 Instruction *NewSI = nullptr; 2587 Value *StoredVal = State.get(StoredValue, Part); 2588 if (CreateGatherScatter) { 2589 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2590 Value *VectorGep = State.get(Addr, Part); 2591 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2592 MaskPart); 2593 } else { 2594 if (Reverse) { 2595 // If we store to reverse consecutive memory locations, then we need 2596 // to reverse the order of elements in the stored value. 2597 StoredVal = reverseVector(StoredVal); 2598 // We don't want to update the value in the map as it might be used in 2599 // another expression. So don't call resetVectorValue(StoredVal). 2600 } 2601 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2602 if (isMaskRequired) 2603 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2604 BlockInMaskParts[Part]); 2605 else 2606 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2607 } 2608 addMetadata(NewSI, SI); 2609 } 2610 return; 2611 } 2612 2613 // Handle loads. 2614 assert(LI && "Must have a load instruction"); 2615 setDebugLocFromInst(Builder, LI); 2616 for (unsigned Part = 0; Part < UF; ++Part) { 2617 Value *NewLI; 2618 if (CreateGatherScatter) { 2619 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2620 Value *VectorGep = State.get(Addr, Part); 2621 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2622 nullptr, "wide.masked.gather"); 2623 addMetadata(NewLI, LI); 2624 } else { 2625 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2626 if (isMaskRequired) 2627 NewLI = Builder.CreateMaskedLoad( 2628 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2629 "wide.masked.load"); 2630 else 2631 NewLI = 2632 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2633 2634 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2635 addMetadata(NewLI, LI); 2636 if (Reverse) 2637 NewLI = reverseVector(NewLI); 2638 } 2639 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2640 } 2641 } 2642 2643 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2644 const VPIteration &Instance, 2645 bool IfPredicateInstr, 2646 VPTransformState &State) { 2647 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2648 2649 setDebugLocFromInst(Builder, Instr); 2650 2651 // Does this instruction return a value ? 
2652 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2653 2654 Instruction *Cloned = Instr->clone(); 2655 if (!IsVoidRetTy) 2656 Cloned->setName(Instr->getName() + ".cloned"); 2657 2658 // Replace the operands of the cloned instructions with their scalar 2659 // equivalents in the new loop. 2660 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2661 auto *NewOp = State.get(User.getOperand(op), Instance); 2662 Cloned->setOperand(op, NewOp); 2663 } 2664 addNewMetadata(Cloned, Instr); 2665 2666 // Place the cloned scalar in the new loop. 2667 Builder.Insert(Cloned); 2668 2669 // Add the cloned scalar to the scalar map entry. 2670 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2671 2672 // If we just cloned a new assumption, add it the assumption cache. 2673 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2674 if (II->getIntrinsicID() == Intrinsic::assume) 2675 AC->registerAssumption(II); 2676 2677 // End if-block. 2678 if (IfPredicateInstr) 2679 PredicatedInstructions.push_back(Cloned); 2680 } 2681 2682 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2683 Value *End, Value *Step, 2684 Instruction *DL) { 2685 BasicBlock *Header = L->getHeader(); 2686 BasicBlock *Latch = L->getLoopLatch(); 2687 // As we're just creating this loop, it's possible no latch exists 2688 // yet. If so, use the header as this will be a single block loop. 2689 if (!Latch) 2690 Latch = Header; 2691 2692 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2693 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2694 setDebugLocFromInst(Builder, OldInst); 2695 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2696 2697 Builder.SetInsertPoint(Latch->getTerminator()); 2698 setDebugLocFromInst(Builder, OldInst); 2699 2700 // Create i+1 and fill the PHINode. 2701 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2702 Induction->addIncoming(Start, L->getLoopPreheader()); 2703 Induction->addIncoming(Next, Latch); 2704 // Create the compare. 2705 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2706 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2707 2708 // Now we have two terminators. Remove the old one from the block. 2709 Latch->getTerminator()->eraseFromParent(); 2710 2711 return Induction; 2712 } 2713 2714 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2715 if (TripCount) 2716 return TripCount; 2717 2718 assert(L && "Create Trip Count for null loop."); 2719 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2720 // Find the loop boundaries. 2721 ScalarEvolution *SE = PSE.getSE(); 2722 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2723 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2724 "Invalid loop count"); 2725 2726 Type *IdxTy = Legal->getWidestInductionType(); 2727 assert(IdxTy && "No type for induction"); 2728 2729 // The exit count might have the type of i64 while the phi is i32. This can 2730 // happen if we have an induction variable that is sign extended before the 2731 // compare. The only way that we get a backedge taken count is that the 2732 // induction variable was signed and as such will not overflow. In such a case 2733 // truncation is legal. 
2734 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2735 IdxTy->getPrimitiveSizeInBits()) 2736 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2737 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2738 2739 // Get the total trip count from the count by adding 1. 2740 const SCEV *ExitCount = SE->getAddExpr( 2741 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2742 2743 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2744 2745 // Expand the trip count and place the new instructions in the preheader. 2746 // Notice that the pre-header does not change, only the loop body. 2747 SCEVExpander Exp(*SE, DL, "induction"); 2748 2749 // Count holds the overall loop count (N). 2750 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2751 L->getLoopPreheader()->getTerminator()); 2752 2753 if (TripCount->getType()->isPointerTy()) 2754 TripCount = 2755 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2756 L->getLoopPreheader()->getTerminator()); 2757 2758 return TripCount; 2759 } 2760 2761 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2762 if (VectorTripCount) 2763 return VectorTripCount; 2764 2765 Value *TC = getOrCreateTripCount(L); 2766 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2767 2768 Type *Ty = TC->getType(); 2769 // This is where we can make the step a runtime constant. 2770 assert(!VF.isScalable() && "scalable vectorization is not supported yet"); 2771 Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF); 2772 2773 // If the tail is to be folded by masking, round the number of iterations N 2774 // up to a multiple of Step instead of rounding down. This is done by first 2775 // adding Step-1 and then rounding down. Note that it's ok if this addition 2776 // overflows: the vector induction variable will eventually wrap to zero given 2777 // that it starts at zero and its Step is a power of two; the loop will then 2778 // exit, with the last early-exit vector comparison also producing all-true. 2779 if (Cost->foldTailByMasking()) { 2780 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2781 "VF*UF must be a power of 2 when folding tail by masking"); 2782 TC = Builder.CreateAdd( 2783 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 2784 } 2785 2786 // Now we need to generate the expression for the part of the loop that the 2787 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2788 // iterations are not required for correctness, or N - Step, otherwise. Step 2789 // is equal to the vectorization factor (number of SIMD elements) times the 2790 // unroll factor (number of SIMD instructions). 2791 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2792 2793 // If there is a non-reversed interleaved group that may speculatively access 2794 // memory out-of-bounds, we need to ensure that there will be at least one 2795 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2796 // the trip count, we set the remainder to be equal to the step. If the step 2797 // does not evenly divide the trip count, no adjustment is necessary since 2798 // there will already be scalar iterations. Note that the minimum iterations 2799 // check ensures that N >= Step. 
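// For example (illustrative values): when a scalar epilogue is required and
// N = 64 with Step = VF * UF = 8, the remainder R is 0, so the select below
// bumps it to 8; the vector loop then covers 56 iterations and the scalar
// epilogue is guaranteed to run the remaining 8.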
2800 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 2801 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2802 R = Builder.CreateSelect(IsZero, Step, R); 2803 } 2804 2805 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2806 2807 return VectorTripCount; 2808 } 2809 2810 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2811 const DataLayout &DL) { 2812 // Verify that V is a vector type with same number of elements as DstVTy. 2813 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2814 unsigned VF = DstFVTy->getNumElements(); 2815 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2816 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2817 Type *SrcElemTy = SrcVecTy->getElementType(); 2818 Type *DstElemTy = DstFVTy->getElementType(); 2819 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2820 "Vector elements must have same size"); 2821 2822 // Do a direct cast if element types are castable. 2823 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2824 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2825 } 2826 // V cannot be directly casted to desired vector type. 2827 // May happen when V is a floating point vector but DstVTy is a vector of 2828 // pointers or vice-versa. Handle this using a two-step bitcast using an 2829 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2830 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2831 "Only one type should be a pointer type"); 2832 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2833 "Only one type should be a floating point type"); 2834 Type *IntTy = 2835 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2836 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2837 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2838 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2839 } 2840 2841 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2842 BasicBlock *Bypass) { 2843 Value *Count = getOrCreateTripCount(L); 2844 // Reuse existing vector loop preheader for TC checks. 2845 // Note that new preheader block is generated for vector loop. 2846 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2847 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2848 2849 // Generate code to check if the loop's trip count is less than VF * UF, or 2850 // equal to it in case a scalar epilogue is required; this implies that the 2851 // vector trip count is zero. This check also covers the case where adding one 2852 // to the backedge-taken count overflowed leading to an incorrect trip count 2853 // of zero. In this case we will also jump to the scalar loop. 2854 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2855 : ICmpInst::ICMP_ULT; 2856 2857 // If tail is to be folded, vector loop takes care of all iterations. 2858 Value *CheckMinIters = Builder.getFalse(); 2859 if (!Cost->foldTailByMasking()) { 2860 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2861 CheckMinIters = Builder.CreateICmp( 2862 P, Count, 2863 ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF), 2864 "min.iters.check"); 2865 } 2866 // Create new preheader for vector loop. 
2867 LoopVectorPreHeader = 2868 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2869 "vector.ph"); 2870 2871 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2872 DT->getNode(Bypass)->getIDom()) && 2873 "TC check is expected to dominate Bypass"); 2874 2875 // Update dominator for Bypass & LoopExit. 2876 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2877 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2878 2879 ReplaceInstWithInst( 2880 TCCheckBlock->getTerminator(), 2881 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2882 LoopBypassBlocks.push_back(TCCheckBlock); 2883 } 2884 2885 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2886 // Reuse existing vector loop preheader for SCEV checks. 2887 // Note that new preheader block is generated for vector loop. 2888 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2889 2890 // Generate the code to check that the SCEV assumptions that we made. 2891 // We want the new basic block to start at the first instruction in a 2892 // sequence of instructions that form a check. 2893 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2894 "scev.check"); 2895 Value *SCEVCheck = Exp.expandCodeForPredicate( 2896 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2897 2898 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2899 if (C->isZero()) 2900 return; 2901 2902 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2903 (OptForSizeBasedOnProfile && 2904 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2905 "Cannot SCEV check stride or overflow when optimizing for size"); 2906 2907 SCEVCheckBlock->setName("vector.scevcheck"); 2908 // Create new preheader for vector loop. 2909 LoopVectorPreHeader = 2910 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2911 nullptr, "vector.ph"); 2912 2913 // Update dominator only if this is first RT check. 2914 if (LoopBypassBlocks.empty()) { 2915 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2916 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2917 } 2918 2919 ReplaceInstWithInst( 2920 SCEVCheckBlock->getTerminator(), 2921 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2922 LoopBypassBlocks.push_back(SCEVCheckBlock); 2923 AddedSafetyChecks = true; 2924 } 2925 2926 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2927 // VPlan-native path does not do any analysis for runtime checks currently. 2928 if (EnableVPlanNativePath) 2929 return; 2930 2931 // Reuse existing vector loop preheader for runtime memory checks. 2932 // Note that new preheader block is generated for vector loop. 2933 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2934 2935 // Generate the code that checks in runtime if arrays overlap. We put the 2936 // checks into a separate block to make the more common case of few elements 2937 // faster. 
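// Conceptually, for two pointer ranges A and B the generated condition is of
// the form !(A.end <= B.start || B.end <= A.start); if it holds, the branch
// created below falls back to the scalar loop instead of the vector loop.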
2938 auto *LAI = Legal->getLAI(); 2939 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 2940 if (!RtPtrChecking.Need) 2941 return; 2942 2943 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2944 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2945 "Cannot emit memory checks when optimizing for size, unless forced " 2946 "to vectorize."); 2947 ORE->emit([&]() { 2948 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2949 L->getStartLoc(), L->getHeader()) 2950 << "Code-size may be reduced by not forcing " 2951 "vectorization, or by source-code modifications " 2952 "eliminating the need for runtime checks " 2953 "(e.g., adding 'restrict')."; 2954 }); 2955 } 2956 2957 MemCheckBlock->setName("vector.memcheck"); 2958 // Create new preheader for vector loop. 2959 LoopVectorPreHeader = 2960 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2961 "vector.ph"); 2962 2963 auto *CondBranch = cast<BranchInst>( 2964 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 2965 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 2966 LoopBypassBlocks.push_back(MemCheckBlock); 2967 AddedSafetyChecks = true; 2968 2969 // Update dominator only if this is first RT check. 2970 if (LoopBypassBlocks.empty()) { 2971 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2972 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2973 } 2974 2975 Instruction *FirstCheckInst; 2976 Instruction *MemRuntimeCheck; 2977 std::tie(FirstCheckInst, MemRuntimeCheck) = 2978 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 2979 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 2980 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 2981 "claimed checks are required"); 2982 CondBranch->setCondition(MemRuntimeCheck); 2983 2984 // We currently don't use LoopVersioning for the actual loop cloning but we 2985 // still use it to add the noalias metadata. 2986 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2987 PSE.getSE()); 2988 LVer->prepareNoAliasMetadata(); 2989 } 2990 2991 Value *InnerLoopVectorizer::emitTransformedIndex( 2992 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2993 const InductionDescriptor &ID) const { 2994 2995 SCEVExpander Exp(*SE, DL, "induction"); 2996 auto Step = ID.getStep(); 2997 auto StartValue = ID.getStartValue(); 2998 assert(Index->getType() == Step->getType() && 2999 "Index type does not match StepValue type"); 3000 3001 // Note: the IR at this point is broken. We cannot use SE to create any new 3002 // SCEV and then expand it, hoping that SCEV's simplification will give us 3003 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3004 // lead to various SCEV crashes. So all we can do is to use builder and rely 3005 // on InstCombine for future simplifications. Here we handle some trivial 3006 // cases only. 
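// The two helpers below only fold the trivial identities (X + 0 == X and
// X * 1 == X) and otherwise emit plain add/mul instructions, leaving any
// further simplification to later passes such as InstCombine.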
3007 auto CreateAdd = [&B](Value *X, Value *Y) { 3008 assert(X->getType() == Y->getType() && "Types don't match!"); 3009 if (auto *CX = dyn_cast<ConstantInt>(X)) 3010 if (CX->isZero()) 3011 return Y; 3012 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3013 if (CY->isZero()) 3014 return X; 3015 return B.CreateAdd(X, Y); 3016 }; 3017 3018 auto CreateMul = [&B](Value *X, Value *Y) { 3019 assert(X->getType() == Y->getType() && "Types don't match!"); 3020 if (auto *CX = dyn_cast<ConstantInt>(X)) 3021 if (CX->isOne()) 3022 return Y; 3023 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3024 if (CY->isOne()) 3025 return X; 3026 return B.CreateMul(X, Y); 3027 }; 3028 3029 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3030 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3031 // the DomTree is not kept up-to-date for additional blocks generated in the 3032 // vector loop. By using the header as insertion point, we guarantee that the 3033 // expanded instructions dominate all their uses. 3034 auto GetInsertPoint = [this, &B]() { 3035 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3036 if (InsertBB != LoopVectorBody && 3037 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3038 return LoopVectorBody->getTerminator(); 3039 return &*B.GetInsertPoint(); 3040 }; 3041 switch (ID.getKind()) { 3042 case InductionDescriptor::IK_IntInduction: { 3043 assert(Index->getType() == StartValue->getType() && 3044 "Index type does not match StartValue type"); 3045 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3046 return B.CreateSub(StartValue, Index); 3047 auto *Offset = CreateMul( 3048 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3049 return CreateAdd(StartValue, Offset); 3050 } 3051 case InductionDescriptor::IK_PtrInduction: { 3052 assert(isa<SCEVConstant>(Step) && 3053 "Expected constant step for pointer induction"); 3054 return B.CreateGEP( 3055 StartValue->getType()->getPointerElementType(), StartValue, 3056 CreateMul(Index, 3057 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3058 } 3059 case InductionDescriptor::IK_FpInduction: { 3060 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3061 auto InductionBinOp = ID.getInductionBinOp(); 3062 assert(InductionBinOp && 3063 (InductionBinOp->getOpcode() == Instruction::FAdd || 3064 InductionBinOp->getOpcode() == Instruction::FSub) && 3065 "Original bin op should be defined for FP induction"); 3066 3067 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3068 3069 // Floating point operations had to be 'fast' to enable the induction. 3070 FastMathFlags Flags; 3071 Flags.setFast(); 3072 3073 Value *MulExp = B.CreateFMul(StepValue, Index); 3074 if (isa<Instruction>(MulExp)) 3075 // We have to check, the MulExp may be a constant. 
3076 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3077 3078 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3079 "induction"); 3080 if (isa<Instruction>(BOp)) 3081 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3082 3083 return BOp; 3084 } 3085 case InductionDescriptor::IK_NoInduction: 3086 return nullptr; 3087 } 3088 llvm_unreachable("invalid enum"); 3089 } 3090 3091 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3092 LoopScalarBody = OrigLoop->getHeader(); 3093 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3094 LoopExitBlock = OrigLoop->getExitBlock(); 3095 assert(LoopExitBlock && "Must have an exit block"); 3096 assert(LoopVectorPreHeader && "Invalid loop structure"); 3097 3098 LoopMiddleBlock = 3099 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3100 LI, nullptr, Twine(Prefix) + "middle.block"); 3101 LoopScalarPreHeader = 3102 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3103 nullptr, Twine(Prefix) + "scalar.ph"); 3104 // We intentionally don't let SplitBlock to update LoopInfo since 3105 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3106 // LoopVectorBody is explicitly added to the correct place few lines later. 3107 LoopVectorBody = 3108 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3109 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3110 3111 // Update dominator for loop exit. 3112 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3113 3114 // Create and register the new vector loop. 3115 Loop *Lp = LI->AllocateLoop(); 3116 Loop *ParentLoop = OrigLoop->getParentLoop(); 3117 3118 // Insert the new loop into the loop nest and register the new basic blocks 3119 // before calling any utilities such as SCEV that require valid LoopInfo. 3120 if (ParentLoop) { 3121 ParentLoop->addChildLoop(Lp); 3122 } else { 3123 LI->addTopLevelLoop(Lp); 3124 } 3125 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3126 return Lp; 3127 } 3128 3129 void InnerLoopVectorizer::createInductionResumeValues(Loop *L, 3130 Value *VectorTripCount) { 3131 assert(VectorTripCount && L && "Expected valid arguments"); 3132 // We are going to resume the execution of the scalar loop. 3133 // Go over all of the induction variables that we found and fix the 3134 // PHIs that are left in the scalar version of the loop. 3135 // The starting values of PHI nodes depend on the counter of the last 3136 // iteration in the vectorized loop. 3137 // If we come from a bypass edge then we need to start from the original 3138 // start value. 3139 for (auto &InductionEntry : Legal->getInductionVars()) { 3140 PHINode *OrigPhi = InductionEntry.first; 3141 InductionDescriptor II = InductionEntry.second; 3142 3143 // Create phi nodes to merge from the backedge-taken check block. 3144 PHINode *BCResumeVal = 3145 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3146 LoopScalarPreHeader->getTerminator()); 3147 // Copy original phi DL over to the new one. 3148 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3149 Value *&EndValue = IVEndValues[OrigPhi]; 3150 if (OrigPhi == OldInduction) { 3151 // We know what the end value is. 
3152 EndValue = VectorTripCount; 3153 } else { 3154 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3155 Type *StepType = II.getStep()->getType(); 3156 Instruction::CastOps CastOp = 3157 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3158 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3159 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3160 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3161 EndValue->setName("ind.end"); 3162 } 3163 3164 // The new PHI merges the original incoming value, in case of a bypass, 3165 // or the value at the end of the vectorized loop. 3166 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3167 3168 // Fix the scalar body counter (PHI node). 3169 // The old induction's phi node in the scalar body needs the truncated 3170 // value. 3171 for (BasicBlock *BB : LoopBypassBlocks) 3172 BCResumeVal->addIncoming(II.getStartValue(), BB); 3173 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3174 } 3175 } 3176 3177 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3178 MDNode *OrigLoopID) { 3179 assert(L && "Expected valid loop."); 3180 3181 // The trip counts should be cached by now. 3182 Value *Count = getOrCreateTripCount(L); 3183 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3184 3185 // We need the OrigLoop (scalar loop part) latch terminator to help 3186 // produce correct debug info for the middle block BB instructions. 3187 // The legality check stage guarantees that the loop will have a single 3188 // latch. 3189 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3190 "Scalar loop latch terminator isn't a branch"); 3191 BranchInst *ScalarLatchBr = 3192 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3193 3194 // Add a check in the middle block to see if we have completed 3195 // all of the iterations in the first vector loop. 3196 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3197 // If tail is to be folded, we know we don't need to run the remainder. 3198 Value *CmpN = Builder.getTrue(); 3199 if (!Cost->foldTailByMasking()) { 3200 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3201 VectorTripCount, "cmp.n", 3202 LoopMiddleBlock->getTerminator()); 3203 3204 // Here we use the same DebugLoc as the scalar loop latch branch instead 3205 // of the corresponding compare because they may have ended up with 3206 // different line numbers and we want to avoid awkward line stepping while 3207 // debugging. Eg. if the compare has got a line number inside the loop. 3208 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3209 } 3210 3211 BranchInst *BrInst = 3212 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3213 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3214 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3215 3216 // Get ready to start creating new instructions into the vectorized body. 3217 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3218 "Inconsistent vector loop preheader"); 3219 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3220 3221 Optional<MDNode *> VectorizedLoopID = 3222 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3223 LLVMLoopVectorizeFollowupVectorized}); 3224 if (VectorizedLoopID.hasValue()) { 3225 L->setLoopID(VectorizedLoopID.getValue()); 3226 3227 // Do not setAlreadyVectorized if loop attributes have been defined 3228 // explicitly. 
3229 return LoopVectorPreHeader; 3230 } 3231 3232 // Keep all loop hints from the original loop on the vector loop (we'll 3233 // replace the vectorizer-specific hints below). 3234 if (MDNode *LID = OrigLoop->getLoopID()) 3235 L->setLoopID(LID); 3236 3237 LoopVectorizeHints Hints(L, true, *ORE); 3238 Hints.setAlreadyVectorized(); 3239 3240 #ifdef EXPENSIVE_CHECKS 3241 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3242 LI->verify(*DT); 3243 #endif 3244 3245 return LoopVectorPreHeader; 3246 } 3247 3248 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3249 /* 3250 In this function we generate a new loop. The new loop will contain 3251 the vectorized instructions while the old loop will continue to run the 3252 scalar remainder. 3253 3254 [ ] <-- loop iteration number check. 3255 / | 3256 / v 3257 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3258 | / | 3259 | / v 3260 || [ ] <-- vector pre header. 3261 |/ | 3262 | v 3263 | [ ] \ 3264 | [ ]_| <-- vector loop. 3265 | | 3266 | v 3267 | -[ ] <--- middle-block. 3268 | / | 3269 | / v 3270 -|- >[ ] <--- new preheader. 3271 | | 3272 | v 3273 | [ ] \ 3274 | [ ]_| <-- old scalar loop to handle remainder. 3275 \ | 3276 \ v 3277 >[ ] <-- exit block. 3278 ... 3279 */ 3280 3281 // Get the metadata of the original loop before it gets modified. 3282 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3283 3284 // Create an empty vector loop, and prepare basic blocks for the runtime 3285 // checks. 3286 Loop *Lp = createVectorLoopSkeleton(""); 3287 3288 // Now, compare the new count to zero. If it is zero skip the vector loop and 3289 // jump to the scalar loop. This check also covers the case where the 3290 // backedge-taken count is uint##_max: adding one to it will overflow leading 3291 // to an incorrect trip count of zero. In this (rare) case we will also jump 3292 // to the scalar loop. 3293 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3294 3295 // Generate the code to check any assumptions that we've made for SCEV 3296 // expressions. 3297 emitSCEVChecks(Lp, LoopScalarPreHeader); 3298 3299 // Generate the code that checks in runtime if arrays overlap. We put the 3300 // checks into a separate block to make the more common case of few elements 3301 // faster. 3302 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3303 3304 // Some loops have a single integer induction variable, while other loops 3305 // don't. One example is c++ iterators that often have multiple pointer 3306 // induction variables. In the code below we also support a case where we 3307 // don't have a single induction variable. 3308 // 3309 // We try to obtain an induction variable from the original loop as hard 3310 // as possible. However if we don't find one that: 3311 // - is an integer 3312 // - counts from zero, stepping by one 3313 // - is the size of the widest induction variable type 3314 // then we create a new one. 3315 OldInduction = Legal->getPrimaryInduction(); 3316 Type *IdxTy = Legal->getWidestInductionType(); 3317 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3318 // The loop step is equal to the vectorization factor (num of SIMD elements) 3319 // times the unroll factor (num of SIMD instructions). 
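// For example, with VF = 4 and UF = 2 the induction variable created below
// advances by 8 on every iteration of the vector loop.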
3320 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3321 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 3322 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3323 Induction = 3324 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3325 getDebugLocFromInstOrOperands(OldInduction)); 3326 3327 // Emit phis for the new starting index of the scalar loop. 3328 createInductionResumeValues(Lp, CountRoundDown); 3329 3330 return completeLoopSkeleton(Lp, OrigLoopID); 3331 } 3332 3333 // Fix up external users of the induction variable. At this point, we are 3334 // in LCSSA form, with all external PHIs that use the IV having one input value, 3335 // coming from the remainder loop. We need those PHIs to also have a correct 3336 // value for the IV when arriving directly from the middle block. 3337 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3338 const InductionDescriptor &II, 3339 Value *CountRoundDown, Value *EndValue, 3340 BasicBlock *MiddleBlock) { 3341 // There are two kinds of external IV usages - those that use the value 3342 // computed in the last iteration (the PHI) and those that use the penultimate 3343 // value (the value that feeds into the phi from the loop latch). 3344 // We allow both, but they, obviously, have different values. 3345 3346 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3347 3348 DenseMap<Value *, Value *> MissingVals; 3349 3350 // An external user of the last iteration's value should see the value that 3351 // the remainder loop uses to initialize its own IV. 3352 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3353 for (User *U : PostInc->users()) { 3354 Instruction *UI = cast<Instruction>(U); 3355 if (!OrigLoop->contains(UI)) { 3356 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3357 MissingVals[UI] = EndValue; 3358 } 3359 } 3360 3361 // An external user of the penultimate value need to see EndValue - Step. 3362 // The simplest way to get this is to recompute it from the constituent SCEVs, 3363 // that is Start + (Step * (CRD - 1)). 3364 for (User *U : OrigPhi->users()) { 3365 auto *UI = cast<Instruction>(U); 3366 if (!OrigLoop->contains(UI)) { 3367 const DataLayout &DL = 3368 OrigLoop->getHeader()->getModule()->getDataLayout(); 3369 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3370 3371 IRBuilder<> B(MiddleBlock->getTerminator()); 3372 Value *CountMinusOne = B.CreateSub( 3373 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3374 Value *CMO = 3375 !II.getStep()->getType()->isIntegerTy() 3376 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3377 II.getStep()->getType()) 3378 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3379 CMO->setName("cast.cmo"); 3380 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3381 Escape->setName("ind.escape"); 3382 MissingVals[UI] = Escape; 3383 } 3384 } 3385 3386 for (auto &I : MissingVals) { 3387 PHINode *PHI = cast<PHINode>(I.first); 3388 // One corner case we have to handle is two IVs "chasing" each-other, 3389 // that is %IV2 = phi [...], [ %IV1, %latch ] 3390 // In this case, if IV1 has an external use, we need to avoid adding both 3391 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3392 // don't already have an incoming value for the middle block. 
3393 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3394 PHI->addIncoming(I.second, MiddleBlock); 3395 } 3396 } 3397 3398 namespace { 3399 3400 struct CSEDenseMapInfo { 3401 static bool canHandle(const Instruction *I) { 3402 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3403 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3404 } 3405 3406 static inline Instruction *getEmptyKey() { 3407 return DenseMapInfo<Instruction *>::getEmptyKey(); 3408 } 3409 3410 static inline Instruction *getTombstoneKey() { 3411 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3412 } 3413 3414 static unsigned getHashValue(const Instruction *I) { 3415 assert(canHandle(I) && "Unknown instruction!"); 3416 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3417 I->value_op_end())); 3418 } 3419 3420 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3421 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3422 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3423 return LHS == RHS; 3424 return LHS->isIdenticalTo(RHS); 3425 } 3426 }; 3427 3428 } // end anonymous namespace 3429 3430 ///Perform cse of induction variable instructions. 3431 static void cse(BasicBlock *BB) { 3432 // Perform simple cse. 3433 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3434 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3435 Instruction *In = &*I++; 3436 3437 if (!CSEDenseMapInfo::canHandle(In)) 3438 continue; 3439 3440 // Check if we can replace this instruction with any of the 3441 // visited instructions. 3442 if (Instruction *V = CSEMap.lookup(In)) { 3443 In->replaceAllUsesWith(V); 3444 In->eraseFromParent(); 3445 continue; 3446 } 3447 3448 CSEMap[In] = In; 3449 } 3450 } 3451 3452 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3453 ElementCount VF, 3454 bool &NeedToScalarize) { 3455 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3456 Function *F = CI->getCalledFunction(); 3457 Type *ScalarRetTy = CI->getType(); 3458 SmallVector<Type *, 4> Tys, ScalarTys; 3459 for (auto &ArgOp : CI->arg_operands()) 3460 ScalarTys.push_back(ArgOp->getType()); 3461 3462 // Estimate cost of scalarized vector call. The source operands are assumed 3463 // to be vectors, so we need to extract individual elements from there, 3464 // execute VF scalar calls, and then gather the result into the vector return 3465 // value. 3466 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3467 TTI::TCK_RecipThroughput); 3468 if (VF.isScalar()) 3469 return ScalarCallCost; 3470 3471 // Compute corresponding vector type for return value and arguments. 3472 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3473 for (Type *ScalarTy : ScalarTys) 3474 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3475 3476 // Compute costs of unpacking argument values for the scalar calls and 3477 // packing the return values to a vector. 3478 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3479 3480 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3481 3482 // If we can't emit a vector call for this function, then the currently found 3483 // cost is the cost we need to return. 
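// For example (illustrative numbers): with VF = 4, a scalar call cost of 10
// and a scalarization overhead of 12, the scalarized cost is 4 * 10 + 12 = 52;
// a vector library variant is only preferred below if it costs less than that.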
3484 NeedToScalarize = true; 3485 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3486 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3487 3488 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3489 return Cost; 3490 3491 // If the corresponding vector cost is cheaper, return its cost. 3492 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3493 TTI::TCK_RecipThroughput); 3494 if (VectorCallCost < Cost) { 3495 NeedToScalarize = false; 3496 return VectorCallCost; 3497 } 3498 return Cost; 3499 } 3500 3501 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3502 ElementCount VF) { 3503 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3504 assert(ID && "Expected intrinsic call!"); 3505 3506 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3507 return TTI.getIntrinsicInstrCost(CostAttrs, 3508 TargetTransformInfo::TCK_RecipThroughput); 3509 } 3510 3511 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3512 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3513 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3514 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3515 } 3516 3517 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3518 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3519 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3520 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3521 } 3522 3523 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3524 // For every instruction `I` in MinBWs, truncate the operands, create a 3525 // truncated version of `I` and reextend its result. InstCombine runs 3526 // later and will remove any ext/trunc pairs. 3527 SmallPtrSet<Value *, 4> Erased; 3528 for (const auto &KV : Cost->getMinimalBitwidths()) { 3529 // If the value wasn't vectorized, we must maintain the original scalar 3530 // type. The absence of the value from VectorLoopValueMap indicates that it 3531 // wasn't vectorized. 3532 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3533 continue; 3534 for (unsigned Part = 0; Part < UF; ++Part) { 3535 Value *I = getOrCreateVectorValue(KV.first, Part); 3536 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3537 continue; 3538 Type *OriginalTy = I->getType(); 3539 Type *ScalarTruncatedTy = 3540 IntegerType::get(OriginalTy->getContext(), KV.second); 3541 auto *TruncatedTy = FixedVectorType::get( 3542 ScalarTruncatedTy, 3543 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3544 if (TruncatedTy == OriginalTy) 3545 continue; 3546 3547 IRBuilder<> B(cast<Instruction>(I)); 3548 auto ShrinkOperand = [&](Value *V) -> Value * { 3549 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3550 if (ZI->getSrcTy() == TruncatedTy) 3551 return ZI->getOperand(0); 3552 return B.CreateZExtOrTrunc(V, TruncatedTy); 3553 }; 3554 3555 // The actual instruction modification depends on the instruction type, 3556 // unfortunately. 3557 Value *NewI = nullptr; 3558 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3559 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3560 ShrinkOperand(BO->getOperand(1))); 3561 3562 // Any wrapping introduced by shrinking this operation shouldn't be 3563 // considered undefined behavior. So, we can't unconditionally copy 3564 // arithmetic wrapping flags to NewI. 
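// For example, an 'add nsw' on i32 that never wraps at its original width may
// well wrap once it has been narrowed to i8, so the nsw/nuw flags are dropped.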
3565 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3566 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3567 NewI = 3568 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3569 ShrinkOperand(CI->getOperand(1))); 3570 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3571 NewI = B.CreateSelect(SI->getCondition(), 3572 ShrinkOperand(SI->getTrueValue()), 3573 ShrinkOperand(SI->getFalseValue())); 3574 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3575 switch (CI->getOpcode()) { 3576 default: 3577 llvm_unreachable("Unhandled cast!"); 3578 case Instruction::Trunc: 3579 NewI = ShrinkOperand(CI->getOperand(0)); 3580 break; 3581 case Instruction::SExt: 3582 NewI = B.CreateSExtOrTrunc( 3583 CI->getOperand(0), 3584 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3585 break; 3586 case Instruction::ZExt: 3587 NewI = B.CreateZExtOrTrunc( 3588 CI->getOperand(0), 3589 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3590 break; 3591 } 3592 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3593 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3594 ->getNumElements(); 3595 auto *O0 = B.CreateZExtOrTrunc( 3596 SI->getOperand(0), 3597 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3598 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3599 ->getNumElements(); 3600 auto *O1 = B.CreateZExtOrTrunc( 3601 SI->getOperand(1), 3602 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3603 3604 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3605 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3606 // Don't do anything with the operands, just extend the result. 3607 continue; 3608 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3609 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3610 ->getNumElements(); 3611 auto *O0 = B.CreateZExtOrTrunc( 3612 IE->getOperand(0), 3613 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3614 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3615 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3616 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3617 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3618 ->getNumElements(); 3619 auto *O0 = B.CreateZExtOrTrunc( 3620 EE->getOperand(0), 3621 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3622 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3623 } else { 3624 // If we don't know what to do, be conservative and don't do anything. 3625 continue; 3626 } 3627 3628 // Lastly, extend the result. 3629 NewI->takeName(cast<Instruction>(I)); 3630 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3631 I->replaceAllUsesWith(Res); 3632 cast<Instruction>(I)->eraseFromParent(); 3633 Erased.insert(I); 3634 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3635 } 3636 } 3637 3638 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3639 for (const auto &KV : Cost->getMinimalBitwidths()) { 3640 // If the value wasn't vectorized, we must maintain the original scalar 3641 // type. The absence of the value from VectorLoopValueMap indicates that it 3642 // wasn't vectorized. 
3643 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3644 continue;
3645 for (unsigned Part = 0; Part < UF; ++Part) {
3646 Value *I = getOrCreateVectorValue(KV.first, Part);
3647 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3648 if (Inst && Inst->use_empty()) {
3649 Value *NewI = Inst->getOperand(0);
3650 Inst->eraseFromParent();
3651 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3652 }
3653 }
3654 }
3655 }
3656
3657 void InnerLoopVectorizer::fixVectorizedLoop() {
3658 // Insert truncates and extends for any truncated instructions as hints to
3659 // InstCombine.
3660 if (VF.isVector())
3661 truncateToMinimalBitwidths();
3662
3663 // Fix widened non-induction PHIs by setting up the PHI operands.
3664 if (OrigPHIsToFix.size()) {
3665 assert(EnableVPlanNativePath &&
3666 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3667 fixNonInductionPHIs();
3668 }
3669
3670 // At this point every instruction in the original loop is widened to a
3671 // vector form. Now we need to fix the recurrences in the loop. These PHI
3672 // nodes are currently empty because we did not want to introduce cycles.
3673 // This is the second stage of vectorizing recurrences.
3674 fixCrossIterationPHIs();
3675
3676 // Forget the original loop.
3677 PSE.getSE()->forgetLoop(OrigLoop);
3678
3679 // Fix up external users of the induction variables.
3680 for (auto &Entry : Legal->getInductionVars())
3681 fixupIVUsers(Entry.first, Entry.second,
3682 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3683 IVEndValues[Entry.first], LoopMiddleBlock);
3684
3685 fixLCSSAPHIs();
3686 for (Instruction *PI : PredicatedInstructions)
3687 sinkScalarOperands(&*PI);
3688
3689 // Remove redundant induction instructions.
3690 cse(LoopVectorBody);
3691
3692 // Set/update profile weights for the vector and remainder loops as original
3693 // loop iterations are now distributed among them. Note that the original loop,
3694 // represented by LoopScalarBody, becomes the remainder loop after vectorization.
3695 //
3696 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3697 // end up with a slightly less accurate result, but that should be OK since
3698 // profile info is not inherently precise anyway. Note also that any bypass of
3699 // the vector code caused by the legality checks is ignored, optimistically
3700 // assigning all the weight to the vector loop.
3701 assert(!VF.isScalable() &&
3702 "cannot use scalable ElementCount to determine unroll factor");
3703 setProfileInfoAfterUnrolling(
3704 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3705 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3706 }
3707
3708 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3709 // In order to support recurrences we need to be able to vectorize Phi nodes.
3710 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3711 // stage #2: We now need to fix the recurrences by adding incoming edges to
3712 // the currently empty PHI nodes. At this point every instruction in the
3713 // original loop is widened to a vector form so we can use them to construct
3714 // the incoming edges.
3715 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3716 // Handle first-order recurrences and reductions that need to be fixed.
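// In shorthand: s = phi [init, ph], [ a[i], latch ] is a first-order
// recurrence (the value coming from the latch does not depend on the phi),
// whereas sum = phi [0, ph], [ sum + a[i], latch ] is a reduction.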
3717 if (Legal->isFirstOrderRecurrence(&Phi)) 3718 fixFirstOrderRecurrence(&Phi); 3719 else if (Legal->isReductionVariable(&Phi)) 3720 fixReduction(&Phi); 3721 } 3722 } 3723 3724 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3725 // This is the second phase of vectorizing first-order recurrences. An 3726 // overview of the transformation is described below. Suppose we have the 3727 // following loop. 3728 // 3729 // for (int i = 0; i < n; ++i) 3730 // b[i] = a[i] - a[i - 1]; 3731 // 3732 // There is a first-order recurrence on "a". For this loop, the shorthand 3733 // scalar IR looks like: 3734 // 3735 // scalar.ph: 3736 // s_init = a[-1] 3737 // br scalar.body 3738 // 3739 // scalar.body: 3740 // i = phi [0, scalar.ph], [i+1, scalar.body] 3741 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3742 // s2 = a[i] 3743 // b[i] = s2 - s1 3744 // br cond, scalar.body, ... 3745 // 3746 // In this example, s1 is a recurrence because it's value depends on the 3747 // previous iteration. In the first phase of vectorization, we created a 3748 // temporary value for s1. We now complete the vectorization and produce the 3749 // shorthand vector IR shown below (for VF = 4, UF = 1). 3750 // 3751 // vector.ph: 3752 // v_init = vector(..., ..., ..., a[-1]) 3753 // br vector.body 3754 // 3755 // vector.body 3756 // i = phi [0, vector.ph], [i+4, vector.body] 3757 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3758 // v2 = a[i, i+1, i+2, i+3]; 3759 // v3 = vector(v1(3), v2(0, 1, 2)) 3760 // b[i, i+1, i+2, i+3] = v2 - v3 3761 // br cond, vector.body, middle.block 3762 // 3763 // middle.block: 3764 // x = v2(3) 3765 // br scalar.ph 3766 // 3767 // scalar.ph: 3768 // s_init = phi [x, middle.block], [a[-1], otherwise] 3769 // br scalar.body 3770 // 3771 // After execution completes the vector loop, we extract the next value of 3772 // the recurrence (x) to use as the initial value in the scalar loop. 3773 3774 // Get the original loop preheader and single loop latch. 3775 auto *Preheader = OrigLoop->getLoopPreheader(); 3776 auto *Latch = OrigLoop->getLoopLatch(); 3777 3778 // Get the initial and previous values of the scalar recurrence. 3779 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3780 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3781 3782 // Create a vector from the initial value. 3783 auto *VectorInit = ScalarInit; 3784 if (VF.isVector()) { 3785 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3786 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 3787 VectorInit = Builder.CreateInsertElement( 3788 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3789 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 3790 } 3791 3792 // We constructed a temporary phi node in the first phase of vectorization. 3793 // This phi node will eventually be deleted. 3794 Builder.SetInsertPoint( 3795 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3796 3797 // Create a phi node for the new recurrence. The current value will either be 3798 // the initial value inserted into a vector or loop-varying vector value. 3799 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3800 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3801 3802 // Get the vectorized previous value of the last part UF - 1. It appears last 3803 // among all unrolled iterations, due to the order of their construction. 
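// For example, with UF = 2 it is part 1 of 'Previous' that feeds the
// recurrence into the next vector iteration, which is why the insertion point
// below is placed after it.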
3804 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3805 3806 // Find and set the insertion point after the previous value if it is an 3807 // instruction. 3808 BasicBlock::iterator InsertPt; 3809 // Note that the previous value may have been constant-folded so it is not 3810 // guaranteed to be an instruction in the vector loop. 3811 // FIXME: Loop invariant values do not form recurrences. We should deal with 3812 // them earlier. 3813 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3814 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3815 else { 3816 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3817 if (isa<PHINode>(PreviousLastPart)) 3818 // If the previous value is a phi node, we should insert after all the phi 3819 // nodes in the block containing the PHI to avoid breaking basic block 3820 // verification. Note that the basic block may be different to 3821 // LoopVectorBody, in case we predicate the loop. 3822 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3823 else 3824 InsertPt = ++PreviousInst->getIterator(); 3825 } 3826 Builder.SetInsertPoint(&*InsertPt); 3827 3828 // We will construct a vector for the recurrence by combining the values for 3829 // the current and previous iterations. This is the required shuffle mask. 3830 assert(!VF.isScalable()); 3831 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 3832 ShuffleMask[0] = VF.getKnownMinValue() - 1; 3833 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 3834 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 3835 3836 // The vector from which to take the initial value for the current iteration 3837 // (actual or unrolled). Initially, this is the vector phi node. 3838 Value *Incoming = VecPhi; 3839 3840 // Shuffle the current and previous vector and update the vector parts. 3841 for (unsigned Part = 0; Part < UF; ++Part) { 3842 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3843 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3844 auto *Shuffle = 3845 VF.isVector() 3846 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 3847 : Incoming; 3848 PhiPart->replaceAllUsesWith(Shuffle); 3849 cast<Instruction>(PhiPart)->eraseFromParent(); 3850 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3851 Incoming = PreviousPart; 3852 } 3853 3854 // Fix the latch value of the new recurrence in the vector loop. 3855 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3856 3857 // Extract the last vector element in the middle block. This will be the 3858 // initial value for the recurrence when jumping to the scalar loop. 3859 auto *ExtractForScalar = Incoming; 3860 if (VF.isVector()) { 3861 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3862 ExtractForScalar = Builder.CreateExtractElement( 3863 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 3864 "vector.recur.extract"); 3865 } 3866 // Extract the second last element in the middle block if the 3867 // Phi is used outside the loop. We need to extract the phi itself 3868 // and not the last element (the phi update in the current iteration). This 3869 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3870 // when the scalar loop is not run at all. 
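// For example, with VF = 4 the extract below takes lane 2 (the second-last
// lane), which corresponds to the value the scalar phi would hold in the last
// iteration covered by the vector loop.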
3871 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3872 if (VF.isVector())
3873 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3874 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
3875 "vector.recur.extract.for.phi");
3876 // When the loop is unrolled without vectorizing, initialize
3877 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value
3878 // of `Incoming`. This is analogous to the vectorized case above: extracting the
3879 // second-last element when VF > 1.
3880 else if (UF > 1)
3881 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3882
3883 // Fix the initial value of the original recurrence in the scalar loop.
3884 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3885 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3886 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3887 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3888 Start->addIncoming(Incoming, BB);
3889 }
3890
3891 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3892 Phi->setName("scalar.recur");
3893
3894 // Finally, fix users of the recurrence outside the loop. The users will need
3895 // either the last value of the scalar recurrence or the last value of the
3896 // vector recurrence we extracted in the middle block. Since the loop is in
3897 // LCSSA form, we just need to find all the phi nodes for the original scalar
3898 // recurrence in the exit block, and then add an edge for the middle block.
3899 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3900 if (LCSSAPhi.getIncomingValue(0) == Phi) {
3901 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3902 }
3903 }
3904 }
3905
3906 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3907 Constant *Zero = Builder.getInt32(0);
3908
3909 // Get its reduction variable descriptor.
3910 assert(Legal->isReductionVariable(Phi) &&
3911 "Unable to find the reduction variable");
3912 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3913
3914 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3915 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3916 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3917 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3918 RdxDesc.getMinMaxRecurrenceKind();
3919 setDebugLocFromInst(Builder, ReductionStartValue);
3920 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3921
3922 // We need to generate a reduction vector from the incoming scalar.
3923 // To do so, we need to generate the 'identity' vector and override
3924 // one of the elements with the incoming scalar reduction. We need
3925 // to do it in the vector-loop preheader.
3926 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3927
3928 // This is the vector-clone of the value that leaves the loop.
3929 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3930
3931 // Find the reduction identity variable. Zero for addition, or and xor;
3932 // one for multiplication; -1 for and.
3933 Value *Identity;
3934 Value *VectorStart;
3935 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3936 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3937 // MinMax reductions have the start value as their identity.
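// For example, for an smax reduction smax(start, start) == start, so seeding
// every lane with the start value leaves the final maximum unchanged.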
3938 if (VF.isScalar() || IsInLoopReductionPhi) { 3939 VectorStart = Identity = ReductionStartValue; 3940 } else { 3941 VectorStart = Identity = 3942 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3943 } 3944 } else { 3945 // Handle other reduction kinds: 3946 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3947 RK, MinMaxKind, VecTy->getScalarType()); 3948 if (VF.isScalar() || IsInLoopReductionPhi) { 3949 Identity = Iden; 3950 // This vector is the Identity vector where the first element is the 3951 // incoming scalar reduction. 3952 VectorStart = ReductionStartValue; 3953 } else { 3954 Identity = ConstantVector::getSplat(VF, Iden); 3955 3956 // This vector is the Identity vector where the first element is the 3957 // incoming scalar reduction. 3958 VectorStart = 3959 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3960 } 3961 } 3962 3963 // Wrap flags are in general invalid after vectorization, clear them. 3964 clearReductionWrapFlags(RdxDesc); 3965 3966 // Fix the vector-loop phi. 3967 3968 // Reductions do not have to start at zero. They can start with 3969 // any loop invariant values. 3970 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3971 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3972 3973 for (unsigned Part = 0; Part < UF; ++Part) { 3974 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3975 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3976 // Make sure to add the reduction start value only to the 3977 // first unroll part. 3978 Value *StartVal = (Part == 0) ? VectorStart : Identity; 3979 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3980 cast<PHINode>(VecRdxPhi) 3981 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3982 } 3983 3984 // Before each round, move the insertion point right between 3985 // the PHIs and the values we are going to write. 3986 // This allows us to write both PHINodes and the extractelement 3987 // instructions. 3988 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3989 3990 setDebugLocFromInst(Builder, LoopExitInst); 3991 3992 // If tail is folded by masking, the vector value to leave the loop should be 3993 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3994 // instead of the former. For an inloop reduction the reduction will already 3995 // be predicated, and does not need to be handled here. 3996 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 3997 for (unsigned Part = 0; Part < UF; ++Part) { 3998 Value *VecLoopExitInst = 3999 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4000 Value *Sel = nullptr; 4001 for (User *U : VecLoopExitInst->users()) { 4002 if (isa<SelectInst>(U)) { 4003 assert(!Sel && "Reduction exit feeding two selects"); 4004 Sel = U; 4005 } else 4006 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4007 } 4008 assert(Sel && "Reduction exit feeds no select"); 4009 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4010 4011 // If the target can create a predicated operator for the reduction at no 4012 // extra cost in the loop (for example a predicated vadd), it can be 4013 // cheaper for the select to remain in the loop than be sunk out of it, 4014 // and so use the select value for the phi instead of the old 4015 // LoopExitValue. 
4016 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4017 if (PreferPredicatedReductionSelect || 4018 TTI->preferPredicatedReductionSelect( 4019 RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()), 4020 Phi->getType(), TargetTransformInfo::ReductionFlags())) { 4021 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4022 VecRdxPhi->setIncomingValueForBlock( 4023 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4024 } 4025 } 4026 } 4027 4028 // If the vector reduction can be performed in a smaller type, we truncate 4029 // then extend the loop exit value to enable InstCombine to evaluate the 4030 // entire expression in the smaller type. 4031 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4032 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4033 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4034 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4035 Builder.SetInsertPoint( 4036 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4037 VectorParts RdxParts(UF); 4038 for (unsigned Part = 0; Part < UF; ++Part) { 4039 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4040 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4041 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4042 : Builder.CreateZExt(Trunc, VecTy); 4043 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4044 UI != RdxParts[Part]->user_end();) 4045 if (*UI != Trunc) { 4046 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4047 RdxParts[Part] = Extnd; 4048 } else { 4049 ++UI; 4050 } 4051 } 4052 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4053 for (unsigned Part = 0; Part < UF; ++Part) { 4054 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4055 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4056 } 4057 } 4058 4059 // Reduce all of the unrolled parts into a single vector. 4060 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4061 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 4062 4063 // The middle block terminator has already been assigned a DebugLoc here (the 4064 // OrigLoop's single latch terminator). We want the whole middle block to 4065 // appear to execute on this line because: (a) it is all compiler generated, 4066 // (b) these instructions are always executed after evaluating the latch 4067 // conditional branch, and (c) other passes may add new predecessors which 4068 // terminate on this line. This is the easiest way to ensure we don't 4069 // accidentally cause an extra step back into the loop while debugging. 4070 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4071 for (unsigned Part = 1; Part < UF; ++Part) { 4072 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4073 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4074 // Floating point operations had to be 'fast' to enable the reduction. 4075 ReducedPartRdx = addFastMathFlag( 4076 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4077 ReducedPartRdx, "bin.rdx"), 4078 RdxDesc.getFastMathFlags()); 4079 else 4080 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 4081 RdxPart); 4082 } 4083 4084 // Create the reduction after the loop. Note that inloop reductions create the 4085 // target reduction in the loop using a Reduction recipe. 
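// For example, for an integer add reduction with UF = 2 the loop above folds
// the two unrolled parts into a single 'bin.rdx' vector add, and the call
// below then collapses that vector into one scalar value.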
4086 if (VF.isVector() && !IsInLoopReductionPhi) { 4087 bool NoNaN = Legal->hasFunNoNaNAttr(); 4088 ReducedPartRdx = 4089 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 4090 // If the reduction can be performed in a smaller type, we need to extend 4091 // the reduction to the wider type before we branch to the original loop. 4092 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4093 ReducedPartRdx = 4094 RdxDesc.isSigned() 4095 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4096 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4097 } 4098 4099 // Create a phi node that merges control-flow from the backedge-taken check 4100 // block and the middle block. 4101 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4102 LoopScalarPreHeader->getTerminator()); 4103 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4104 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4105 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4106 4107 // Now, we need to fix the users of the reduction variable 4108 // inside and outside of the scalar remainder loop. 4109 // We know that the loop is in LCSSA form. We need to update the 4110 // PHI nodes in the exit blocks. 4111 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4112 // All PHINodes need to have a single entry edge, or two if 4113 // we already fixed them. 4114 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4115 4116 // We found a reduction value exit-PHI. Update it with the 4117 // incoming bypass edge. 4118 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4119 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4120 } // end of the LCSSA phi scan. 4121 4122 // Fix the scalar loop reduction variable with the incoming reduction sum 4123 // from the vector body and from the backedge value. 4124 int IncomingEdgeBlockIdx = 4125 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4126 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4127 // Pick the other block. 4128 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4129 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4130 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4131 } 4132 4133 void InnerLoopVectorizer::clearReductionWrapFlags( 4134 RecurrenceDescriptor &RdxDesc) { 4135 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 4136 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 4137 RK != RecurrenceDescriptor::RK_IntegerMult) 4138 return; 4139 4140 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4141 assert(LoopExitInstr && "null loop exit instruction"); 4142 SmallVector<Instruction *, 8> Worklist; 4143 SmallPtrSet<Instruction *, 8> Visited; 4144 Worklist.push_back(LoopExitInstr); 4145 Visited.insert(LoopExitInstr); 4146 4147 while (!Worklist.empty()) { 4148 Instruction *Cur = Worklist.pop_back_val(); 4149 if (isa<OverflowingBinaryOperator>(Cur)) 4150 for (unsigned Part = 0; Part < UF; ++Part) { 4151 Value *V = getOrCreateVectorValue(Cur, Part); 4152 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4153 } 4154 4155 for (User *U : Cur->users()) { 4156 Instruction *UI = cast<Instruction>(U); 4157 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4158 Visited.insert(UI).second) 4159 Worklist.push_back(UI); 4160 } 4161 } 4162 } 4163 4164 void InnerLoopVectorizer::fixLCSSAPHIs() { 4165 assert(!VF.isScalable() && "the code below assumes fixed width vectors"); 4166 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4167 if (LCSSAPhi.getNumIncomingValues() == 1) { 4168 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4169 // Non-instruction incoming values will have only one value. 4170 unsigned LastLane = 0; 4171 if (isa<Instruction>(IncomingValue)) 4172 LastLane = Cost->isUniformAfterVectorization( 4173 cast<Instruction>(IncomingValue), VF) 4174 ? 0 4175 : VF.getKnownMinValue() - 1; 4176 // Can be a loop invariant incoming value or the last scalar value to be 4177 // extracted from the vectorized loop. 4178 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4179 Value *lastIncomingValue = 4180 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4181 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4182 } 4183 } 4184 } 4185 4186 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4187 // The basic block and loop containing the predicated instruction. 4188 auto *PredBB = PredInst->getParent(); 4189 auto *VectorLoop = LI->getLoopFor(PredBB); 4190 4191 // Initialize a worklist with the operands of the predicated instruction. 4192 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4193 4194 // Holds instructions that we need to analyze again. An instruction may be 4195 // reanalyzed if we don't yet know if we can sink it or not. 4196 SmallVector<Instruction *, 8> InstsToReanalyze; 4197 4198 // Returns true if a given use occurs in the predicated block. Phi nodes use 4199 // their operands in their corresponding predecessor blocks. 4200 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4201 auto *I = cast<Instruction>(U.getUser()); 4202 BasicBlock *BB = I->getParent(); 4203 if (auto *Phi = dyn_cast<PHINode>(I)) 4204 BB = Phi->getIncomingBlock( 4205 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4206 return BB == PredBB; 4207 }; 4208 4209 // Iteratively sink the scalarized operands of the predicated instruction 4210 // into the block we created for it. When an instruction is sunk, it's 4211 // operands are then added to the worklist. 
The algorithm ends after one pass 4212 // through the worklist doesn't sink a single instruction. 4213 bool Changed; 4214 do { 4215 // Add the instructions that need to be reanalyzed to the worklist, and 4216 // reset the changed indicator. 4217 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4218 InstsToReanalyze.clear(); 4219 Changed = false; 4220 4221 while (!Worklist.empty()) { 4222 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4223 4224 // We can't sink an instruction if it is a phi node, is already in the 4225 // predicated block, is not in the loop, or may have side effects. 4226 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4227 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4228 continue; 4229 4230 // It's legal to sink the instruction if all its uses occur in the 4231 // predicated block. Otherwise, there's nothing to do yet, and we may 4232 // need to reanalyze the instruction. 4233 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4234 InstsToReanalyze.push_back(I); 4235 continue; 4236 } 4237 4238 // Move the instruction to the beginning of the predicated block, and add 4239 // it's operands to the worklist. 4240 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4241 Worklist.insert(I->op_begin(), I->op_end()); 4242 4243 // The sinking may have enabled other instructions to be sunk, so we will 4244 // need to iterate. 4245 Changed = true; 4246 } 4247 } while (Changed); 4248 } 4249 4250 void InnerLoopVectorizer::fixNonInductionPHIs() { 4251 for (PHINode *OrigPhi : OrigPHIsToFix) { 4252 PHINode *NewPhi = 4253 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4254 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4255 4256 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4257 predecessors(OrigPhi->getParent())); 4258 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4259 predecessors(NewPhi->getParent())); 4260 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4261 "Scalar and Vector BB should have the same number of predecessors"); 4262 4263 // The insertion point in Builder may be invalidated by the time we get 4264 // here. Force the Builder insertion point to something valid so that we do 4265 // not run into issues during insertion point restore in 4266 // getOrCreateVectorValue calls below. 4267 Builder.SetInsertPoint(NewPhi); 4268 4269 // The predecessor order is preserved and we can rely on mapping between 4270 // scalar and vector block predecessors. 4271 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4272 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4273 4274 // When looking up the new scalar/vector values to fix up, use incoming 4275 // values from original phi. 4276 Value *ScIncV = 4277 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4278 4279 // Scalar incoming value may need a broadcast 4280 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4281 NewPhi->addIncoming(NewIncV, NewPredBB); 4282 } 4283 } 4284 } 4285 4286 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, 4287 unsigned UF, ElementCount VF, 4288 bool IsPtrLoopInvariant, 4289 SmallBitVector &IsIndexLoopInvariant, 4290 VPTransformState &State) { 4291 // Construct a vector GEP by widening the operands of the scalar GEP as 4292 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4293 // results in a vector of pointers when at least one operand of the GEP 4294 // is vector-typed. 
Thus, to keep the representation compact, we only use 4295 // vector-typed operands for loop-varying values. 4296 4297 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4298 // If we are vectorizing, but the GEP has only loop-invariant operands, 4299 // the GEP we build (by only using vector-typed operands for 4300 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4301 // produce a vector of pointers, we need to either arbitrarily pick an 4302 // operand to broadcast, or broadcast a clone of the original GEP. 4303 // Here, we broadcast a clone of the original. 4304 // 4305 // TODO: If at some point we decide to scalarize instructions having 4306 // loop-invariant operands, this special case will no longer be 4307 // required. We would add the scalarization decision to 4308 // collectLoopScalars() and teach getVectorValue() to broadcast 4309 // the lane-zero scalar value. 4310 auto *Clone = Builder.Insert(GEP->clone()); 4311 for (unsigned Part = 0; Part < UF; ++Part) { 4312 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4313 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4314 addMetadata(EntryPart, GEP); 4315 } 4316 } else { 4317 // If the GEP has at least one loop-varying operand, we are sure to 4318 // produce a vector of pointers. But if we are only unrolling, we want 4319 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4320 // produce with the code below will be scalar (if VF == 1) or vector 4321 // (otherwise). Note that for the unroll-only case, we still maintain 4322 // values in the vector mapping with initVector, as we do for other 4323 // instructions. 4324 for (unsigned Part = 0; Part < UF; ++Part) { 4325 // The pointer operand of the new GEP. If it's loop-invariant, we 4326 // won't broadcast it. 4327 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4328 : State.get(Operands.getOperand(0), Part); 4329 4330 // Collect all the indices for the new GEP. If any index is 4331 // loop-invariant, we won't broadcast it. 4332 SmallVector<Value *, 4> Indices; 4333 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4334 VPValue *Operand = Operands.getOperand(I); 4335 if (IsIndexLoopInvariant[I - 1]) 4336 Indices.push_back(State.get(Operand, {0, 0})); 4337 else 4338 Indices.push_back(State.get(Operand, Part)); 4339 } 4340 4341 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4342 // but it should be a vector, otherwise. 4343 auto *NewGEP = 4344 GEP->isInBounds() 4345 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4346 Indices) 4347 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4348 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4349 "NewGEP is not a pointer vector"); 4350 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4351 addMetadata(NewGEP, GEP); 4352 } 4353 } 4354 } 4355 4356 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4357 ElementCount VF) { 4358 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4359 PHINode *P = cast<PHINode>(PN); 4360 if (EnableVPlanNativePath) { 4361 // Currently we enter here in the VPlan-native path for non-induction 4362 // PHIs where all control flow is uniform. We simply widen these PHIs. 4363 // Create a vector phi with no operands - the vector phi operands will be 4364 // set at the end of vector code generation. 4365 Type *VecTy = 4366 (VF.isScalar()) ? 
PN->getType() : VectorType::get(PN->getType(), VF); 4367 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4368 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4369 OrigPHIsToFix.push_back(P); 4370 4371 return; 4372 } 4373 4374 assert(PN->getParent() == OrigLoop->getHeader() && 4375 "Non-header phis should have been handled elsewhere"); 4376 4377 // In order to support recurrences we need to be able to vectorize Phi nodes. 4378 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4379 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4380 // this value when we vectorize all of the instructions that use the PHI. 4381 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4382 for (unsigned Part = 0; Part < UF; ++Part) { 4383 // This is phase one of vectorizing PHIs. 4384 bool ScalarPHI = 4385 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4386 Type *VecTy = 4387 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4388 Value *EntryPart = PHINode::Create( 4389 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4390 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4391 } 4392 return; 4393 } 4394 4395 setDebugLocFromInst(Builder, P); 4396 4397 // This PHINode must be an induction variable. 4398 // Make sure that we know about it. 4399 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4400 4401 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4402 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4403 4404 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4405 // which can be found from the original scalar operations. 4406 switch (II.getKind()) { 4407 case InductionDescriptor::IK_NoInduction: 4408 llvm_unreachable("Unknown induction"); 4409 case InductionDescriptor::IK_IntInduction: 4410 case InductionDescriptor::IK_FpInduction: 4411 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4412 case InductionDescriptor::IK_PtrInduction: { 4413 // Handle the pointer induction variable case. 4414 assert(P->getType()->isPointerTy() && "Unexpected type."); 4415 4416 if (Cost->isScalarAfterVectorization(P, VF)) { 4417 // This is the normalized GEP that starts counting at zero. 4418 Value *PtrInd = 4419 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4420 // Determine the number of scalars we need to generate for each unroll 4421 // iteration. If the instruction is uniform, we only need to generate the 4422 // first lane. Otherwise, we generate all VF values. 4423 unsigned Lanes = 4424 Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.getKnownMinValue(); 4425 for (unsigned Part = 0; Part < UF; ++Part) { 4426 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4427 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4428 Lane + Part * VF.getKnownMinValue()); 4429 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4430 Value *SclrGep = 4431 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4432 SclrGep->setName("next.gep"); 4433 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4434 } 4435 } 4436 return; 4437 } 4438 assert(isa<SCEVConstant>(II.getStep()) && 4439 "Induction step not a SCEV constant!"); 4440 Type *PhiType = II.getStep()->getType(); 4441 4442 // Build a pointer phi 4443 Value *ScalarStartValue = II.getStartValue(); 4444 Type *ScStValueType = ScalarStartValue->getType(); 4445 PHINode *NewPointerPhi = 4446 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4447 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4448 4449 // A pointer induction, performed by using a gep 4450 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4451 Instruction *InductionLoc = LoopLatch->getTerminator(); 4452 const SCEV *ScalarStep = II.getStep(); 4453 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4454 Value *ScalarStepValue = 4455 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4456 Value *InductionGEP = GetElementPtrInst::Create( 4457 ScStValueType->getPointerElementType(), NewPointerPhi, 4458 Builder.CreateMul( 4459 ScalarStepValue, 4460 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4461 "ptr.ind", InductionLoc); 4462 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4463 4464 // Create UF many actual address geps that use the pointer 4465 // phi as base and a vectorized version of the step value 4466 // (<step*0, ..., step*N>) as offset. 4467 for (unsigned Part = 0; Part < UF; ++Part) { 4468 SmallVector<Constant *, 8> Indices; 4469 // Create a vector of consecutive numbers from zero to VF. 4470 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4471 Indices.push_back( 4472 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4473 Constant *StartOffset = ConstantVector::get(Indices); 4474 4475 Value *GEP = Builder.CreateGEP( 4476 ScStValueType->getPointerElementType(), NewPointerPhi, 4477 Builder.CreateMul( 4478 StartOffset, 4479 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4480 "vector.gep")); 4481 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4482 } 4483 } 4484 } 4485 } 4486 4487 /// A helper function for checking whether an integer division-related 4488 /// instruction may divide by zero (in which case it must be predicated if 4489 /// executed conditionally in the scalar code). 4490 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4491 /// Non-zero divisors that are non compile-time constants will not be 4492 /// converted into multiplication, so we will still end up scalarizing 4493 /// the division, but can do so w/o predication. 
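/// A hedged example of the case being guarded (illustrative only):
///   for (int i = 0; i < n; ++i)
///     if (b[i] != 0)
///       q[i] = a[i] / b[i];
/// Executed unconditionally in a widened vector body, lanes where b[i] == 0
/// would divide by zero, so any divisor that is not a known non-zero constant
/// keeps the division predicated/scalarized.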
4494 static bool mayDivideByZero(Instruction &I) { 4495 assert((I.getOpcode() == Instruction::UDiv || 4496 I.getOpcode() == Instruction::SDiv || 4497 I.getOpcode() == Instruction::URem || 4498 I.getOpcode() == Instruction::SRem) && 4499 "Unexpected instruction"); 4500 Value *Divisor = I.getOperand(1); 4501 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4502 return !CInt || CInt->isZero(); 4503 } 4504 4505 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, 4506 VPTransformState &State) { 4507 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4508 switch (I.getOpcode()) { 4509 case Instruction::Call: 4510 case Instruction::Br: 4511 case Instruction::PHI: 4512 case Instruction::GetElementPtr: 4513 case Instruction::Select: 4514 llvm_unreachable("This instruction is handled by a different recipe."); 4515 case Instruction::UDiv: 4516 case Instruction::SDiv: 4517 case Instruction::SRem: 4518 case Instruction::URem: 4519 case Instruction::Add: 4520 case Instruction::FAdd: 4521 case Instruction::Sub: 4522 case Instruction::FSub: 4523 case Instruction::FNeg: 4524 case Instruction::Mul: 4525 case Instruction::FMul: 4526 case Instruction::FDiv: 4527 case Instruction::FRem: 4528 case Instruction::Shl: 4529 case Instruction::LShr: 4530 case Instruction::AShr: 4531 case Instruction::And: 4532 case Instruction::Or: 4533 case Instruction::Xor: { 4534 // Just widen unops and binops. 4535 setDebugLocFromInst(Builder, &I); 4536 4537 for (unsigned Part = 0; Part < UF; ++Part) { 4538 SmallVector<Value *, 2> Ops; 4539 for (VPValue *VPOp : User.operands()) 4540 Ops.push_back(State.get(VPOp, Part)); 4541 4542 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4543 4544 if (auto *VecOp = dyn_cast<Instruction>(V)) 4545 VecOp->copyIRFlags(&I); 4546 4547 // Use this vector value for all users of the original instruction. 4548 VectorLoopValueMap.setVectorValue(&I, Part, V); 4549 addMetadata(V, &I); 4550 } 4551 4552 break; 4553 } 4554 case Instruction::ICmp: 4555 case Instruction::FCmp: { 4556 // Widen compares. Generate vector compares. 4557 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4558 auto *Cmp = cast<CmpInst>(&I); 4559 setDebugLocFromInst(Builder, Cmp); 4560 for (unsigned Part = 0; Part < UF; ++Part) { 4561 Value *A = State.get(User.getOperand(0), Part); 4562 Value *B = State.get(User.getOperand(1), Part); 4563 Value *C = nullptr; 4564 if (FCmp) { 4565 // Propagate fast math flags. 4566 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4567 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4568 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4569 } else { 4570 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4571 } 4572 VectorLoopValueMap.setVectorValue(&I, Part, C); 4573 addMetadata(C, &I); 4574 } 4575 4576 break; 4577 } 4578 4579 case Instruction::ZExt: 4580 case Instruction::SExt: 4581 case Instruction::FPToUI: 4582 case Instruction::FPToSI: 4583 case Instruction::FPExt: 4584 case Instruction::PtrToInt: 4585 case Instruction::IntToPtr: 4586 case Instruction::SIToFP: 4587 case Instruction::UIToFP: 4588 case Instruction::Trunc: 4589 case Instruction::FPTrunc: 4590 case Instruction::BitCast: { 4591 auto *CI = cast<CastInst>(&I); 4592 setDebugLocFromInst(Builder, CI); 4593 4594 /// Vectorize casts. 4595 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4596 Type *DestTy = 4597 (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4598 4599 for (unsigned Part = 0; Part < UF; ++Part) { 4600 Value *A = State.get(User.getOperand(0), Part); 4601 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4602 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4603 addMetadata(Cast, &I); 4604 } 4605 break; 4606 } 4607 default: 4608 // This instruction is not vectorized by simple widening. 4609 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4610 llvm_unreachable("Unhandled instruction!"); 4611 } // end of switch. 4612 } 4613 4614 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4615 VPTransformState &State) { 4616 assert(!isa<DbgInfoIntrinsic>(I) && 4617 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4618 setDebugLocFromInst(Builder, &I); 4619 4620 Module *M = I.getParent()->getParent()->getParent(); 4621 auto *CI = cast<CallInst>(&I); 4622 4623 SmallVector<Type *, 4> Tys; 4624 for (Value *ArgOperand : CI->arg_operands()) 4625 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4626 4627 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4628 4629 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4630 // version of the instruction. 4631 // Is it beneficial to perform intrinsic call compared to lib call? 4632 bool NeedToScalarize = false; 4633 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4634 bool UseVectorIntrinsic = 4635 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4636 assert((UseVectorIntrinsic || !NeedToScalarize) && 4637 "Instruction should be scalarized elsewhere."); 4638 4639 for (unsigned Part = 0; Part < UF; ++Part) { 4640 SmallVector<Value *, 4> Args; 4641 for (auto &I : enumerate(ArgOperands.operands())) { 4642 // Some intrinsics have a scalar argument - don't replace it with a 4643 // vector. 4644 Value *Arg; 4645 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4646 Arg = State.get(I.value(), Part); 4647 else 4648 Arg = State.get(I.value(), {0, 0}); 4649 Args.push_back(Arg); 4650 } 4651 4652 Function *VectorF; 4653 if (UseVectorIntrinsic) { 4654 // Use vector version of the intrinsic. 4655 Type *TysForDecl[] = {CI->getType()}; 4656 if (VF.isVector()) { 4657 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4658 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4659 } 4660 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4661 assert(VectorF && "Can't retrieve vector intrinsic."); 4662 } else { 4663 // Use vector version of the function call. 4664 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4665 #ifndef NDEBUG 4666 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4667 "Can't create vector function."); 4668 #endif 4669 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4670 } 4671 SmallVector<OperandBundleDef, 1> OpBundles; 4672 CI->getOperandBundlesAsDefs(OpBundles); 4673 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4674 4675 if (isa<FPMathOperator>(V)) 4676 V->copyFastMathFlags(CI); 4677 4678 VectorLoopValueMap.setVectorValue(&I, Part, V); 4679 addMetadata(V, &I); 4680 } 4681 } 4682 4683 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4684 VPUser &Operands, 4685 bool InvariantCond, 4686 VPTransformState &State) { 4687 setDebugLocFromInst(Builder, &I); 4688 4689 // The condition can be loop invariant but still defined inside the 4690 // loop. 
This means that we can't just use the original 'cond' value. 4691 // We have to take the 'vectorized' value and pick the first lane. 4692 // Instcombine will make this a no-op. 4693 auto *InvarCond = 4694 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4695 4696 for (unsigned Part = 0; Part < UF; ++Part) { 4697 Value *Cond = 4698 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4699 Value *Op0 = State.get(Operands.getOperand(1), Part); 4700 Value *Op1 = State.get(Operands.getOperand(2), Part); 4701 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4702 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4703 addMetadata(Sel, &I); 4704 } 4705 } 4706 4707 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4708 // We should not collect Scalars more than once per VF. Right now, this 4709 // function is called from collectUniformsAndScalars(), which already does 4710 // this check. Collecting Scalars for VF=1 does not make any sense. 4711 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4712 "This function should not be visited twice for the same VF"); 4713 4714 SmallSetVector<Instruction *, 8> Worklist; 4715 4716 // These sets are used to seed the analysis with pointers used by memory 4717 // accesses that will remain scalar. 4718 SmallSetVector<Instruction *, 8> ScalarPtrs; 4719 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4720 auto *Latch = TheLoop->getLoopLatch(); 4721 4722 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4723 // The pointer operands of loads and stores will be scalar as long as the 4724 // memory access is not a gather or scatter operation. The value operand of a 4725 // store will remain scalar if the store is scalarized. 4726 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4727 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4728 assert(WideningDecision != CM_Unknown && 4729 "Widening decision should be ready at this moment"); 4730 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4731 if (Ptr == Store->getValueOperand()) 4732 return WideningDecision == CM_Scalarize; 4733 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4734 "Ptr is neither a value or pointer operand"); 4735 return WideningDecision != CM_GatherScatter; 4736 }; 4737 4738 // A helper that returns true if the given value is a bitcast or 4739 // getelementptr instruction contained in the loop. 4740 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4741 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4742 isa<GetElementPtrInst>(V)) && 4743 !TheLoop->isLoopInvariant(V); 4744 }; 4745 4746 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4747 if (!isa<PHINode>(Ptr) || 4748 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4749 return false; 4750 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4751 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4752 return false; 4753 return isScalarUse(MemAccess, Ptr); 4754 }; 4755 4756 // A helper that evaluates a memory access's use of a pointer. If the 4757 // pointer is actually the pointer induction of a loop, it is being 4758 // inserted into Worklist. If the use will be a scalar use, and the 4759 // pointer is only used by memory accesses, we place the pointer in 4760 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
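// Hedged example of the classification (simplified): in
//   for (int i = 0; i < n; ++i) a[i] = b[i];
// the address computations of the consecutive load and store are used only as
// pointer operands of widened accesses, so they are placed in ScalarPtrs; if
// such an address were additionally compared against another pointer or
// stored as a value, it would be placed in PossibleNonScalarPtrs instead.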
4761 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4762 if (isScalarPtrInduction(MemAccess, Ptr)) { 4763 Worklist.insert(cast<Instruction>(Ptr)); 4764 Instruction *Update = cast<Instruction>( 4765 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 4766 Worklist.insert(Update); 4767 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 4768 << "\n"); 4769 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 4770 << "\n"); 4771 return; 4772 } 4773 // We only care about bitcast and getelementptr instructions contained in 4774 // the loop. 4775 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4776 return; 4777 4778 // If the pointer has already been identified as scalar (e.g., if it was 4779 // also identified as uniform), there's nothing to do. 4780 auto *I = cast<Instruction>(Ptr); 4781 if (Worklist.count(I)) 4782 return; 4783 4784 // If the use of the pointer will be a scalar use, and all users of the 4785 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4786 // place the pointer in PossibleNonScalarPtrs. 4787 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4788 return isa<LoadInst>(U) || isa<StoreInst>(U); 4789 })) 4790 ScalarPtrs.insert(I); 4791 else 4792 PossibleNonScalarPtrs.insert(I); 4793 }; 4794 4795 // We seed the scalars analysis with three classes of instructions: (1) 4796 // instructions marked uniform-after-vectorization and (2) bitcast, 4797 // getelementptr and (pointer) phi instructions used by memory accesses 4798 // requiring a scalar use. 4799 // 4800 // (1) Add to the worklist all instructions that have been identified as 4801 // uniform-after-vectorization. 4802 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4803 4804 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4805 // memory accesses requiring a scalar use. The pointer operands of loads and 4806 // stores will be scalar as long as the memory accesses is not a gather or 4807 // scatter operation. The value operand of a store will remain scalar if the 4808 // store is scalarized. 4809 for (auto *BB : TheLoop->blocks()) 4810 for (auto &I : *BB) { 4811 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4812 evaluatePtrUse(Load, Load->getPointerOperand()); 4813 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4814 evaluatePtrUse(Store, Store->getPointerOperand()); 4815 evaluatePtrUse(Store, Store->getValueOperand()); 4816 } 4817 } 4818 for (auto *I : ScalarPtrs) 4819 if (!PossibleNonScalarPtrs.count(I)) { 4820 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4821 Worklist.insert(I); 4822 } 4823 4824 // Insert the forced scalars. 4825 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4826 // induction variable when the PHI user is scalarized. 4827 auto ForcedScalar = ForcedScalars.find(VF); 4828 if (ForcedScalar != ForcedScalars.end()) 4829 for (auto *I : ForcedScalar->second) 4830 Worklist.insert(I); 4831 4832 // Expand the worklist by looking through any bitcasts and getelementptr 4833 // instructions we've already identified as scalar. This is similar to the 4834 // expansion step in collectLoopUniforms(); however, here we're only 4835 // expanding to include additional bitcasts and getelementptr instructions. 
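// Hedged illustration of this expansion (assumed, simplified): if a
// scalarized access computes its address as a getelementptr whose result is
// only passed through a bitcast before feeding the load/store, then once the
// bitcast is in the worklist the walk below also adds the getelementptr,
// provided every in-loop user of the getelementptr is either already in the
// worklist or a memory access that uses it as a scalar address.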
4836 unsigned Idx = 0; 4837 while (Idx != Worklist.size()) { 4838 Instruction *Dst = Worklist[Idx++]; 4839 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4840 continue; 4841 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4842 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4843 auto *J = cast<Instruction>(U); 4844 return !TheLoop->contains(J) || Worklist.count(J) || 4845 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4846 isScalarUse(J, Src)); 4847 })) { 4848 Worklist.insert(Src); 4849 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4850 } 4851 } 4852 4853 // An induction variable will remain scalar if all users of the induction 4854 // variable and induction variable update remain scalar. 4855 for (auto &Induction : Legal->getInductionVars()) { 4856 auto *Ind = Induction.first; 4857 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4858 4859 // If tail-folding is applied, the primary induction variable will be used 4860 // to feed a vector compare. 4861 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4862 continue; 4863 4864 // Determine if all users of the induction variable are scalar after 4865 // vectorization. 4866 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4867 auto *I = cast<Instruction>(U); 4868 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4869 }); 4870 if (!ScalarInd) 4871 continue; 4872 4873 // Determine if all users of the induction variable update instruction are 4874 // scalar after vectorization. 4875 auto ScalarIndUpdate = 4876 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4877 auto *I = cast<Instruction>(U); 4878 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4879 }); 4880 if (!ScalarIndUpdate) 4881 continue; 4882 4883 // The induction variable and its update instruction will remain scalar. 4884 Worklist.insert(Ind); 4885 Worklist.insert(IndUpdate); 4886 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4887 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4888 << "\n"); 4889 } 4890 4891 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4892 } 4893 4894 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 4895 ElementCount VF) { 4896 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4897 if (!blockNeedsPredication(I->getParent())) 4898 return false; 4899 switch(I->getOpcode()) { 4900 default: 4901 break; 4902 case Instruction::Load: 4903 case Instruction::Store: { 4904 if (!Legal->isMaskRequired(I)) 4905 return false; 4906 auto *Ptr = getLoadStorePointerOperand(I); 4907 auto *Ty = getMemInstValueType(I); 4908 // We have already decided how to vectorize this instruction, get that 4909 // result. 4910 if (VF.isVector()) { 4911 InstWidening WideningDecision = getWideningDecision(I, VF); 4912 assert(WideningDecision != CM_Unknown && 4913 "Widening decision should be ready at this moment"); 4914 return WideningDecision == CM_Scalarize; 4915 } 4916 const Align Alignment = getLoadStoreAlignment(I); 4917 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4918 isLegalMaskedGather(Ty, Alignment)) 4919 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4920 isLegalMaskedScatter(Ty, Alignment)); 4921 } 4922 case Instruction::UDiv: 4923 case Instruction::SDiv: 4924 case Instruction::SRem: 4925 case Instruction::URem: 4926 return mayDivideByZero(*I); 4927 } 4928 return false; 4929 } 4930 4931 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4932 Instruction *I, ElementCount VF) { 4933 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4934 assert(getWideningDecision(I, VF) == CM_Unknown && 4935 "Decision should not be set yet."); 4936 auto *Group = getInterleavedAccessGroup(I); 4937 assert(Group && "Must have a group."); 4938 4939 // If the instruction's allocated size doesn't equal it's type size, it 4940 // requires padding and will be scalarized. 4941 auto &DL = I->getModule()->getDataLayout(); 4942 auto *ScalarTy = getMemInstValueType(I); 4943 if (hasIrregularType(ScalarTy, DL, VF)) 4944 return false; 4945 4946 // Check if masking is required. 4947 // A Group may need masking for one of two reasons: it resides in a block that 4948 // needs predication, or it was decided to use masking to deal with gaps. 4949 bool PredicatedAccessRequiresMasking = 4950 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4951 bool AccessWithGapsRequiresMasking = 4952 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4953 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4954 return true; 4955 4956 // If masked interleaving is required, we expect that the user/target had 4957 // enabled it, because otherwise it either wouldn't have been created or 4958 // it should have been invalidated by the CostModel. 4959 assert(useMaskedInterleavedAccesses(TTI) && 4960 "Masked interleave-groups for predicated accesses are not enabled."); 4961 4962 auto *Ty = getMemInstValueType(I); 4963 const Align Alignment = getLoadStoreAlignment(I); 4964 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4965 : TTI.isLegalMaskedStore(Ty, Alignment); 4966 } 4967 4968 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4969 Instruction *I, ElementCount VF) { 4970 // Get and ensure we have a valid memory instruction. 4971 LoadInst *LI = dyn_cast<LoadInst>(I); 4972 StoreInst *SI = dyn_cast<StoreInst>(I); 4973 assert((LI || SI) && "Invalid memory instruction"); 4974 4975 auto *Ptr = getLoadStorePointerOperand(I); 4976 4977 // In order to be widened, the pointer should be consecutive, first of all. 4978 if (!Legal->isConsecutivePtr(Ptr)) 4979 return false; 4980 4981 // If the instruction is a store located in a predicated block, it will be 4982 // scalarized. 4983 if (isScalarWithPredication(I)) 4984 return false; 4985 4986 // If the instruction's allocated size doesn't equal it's type size, it 4987 // requires padding and will be scalarized. 4988 auto &DL = I->getModule()->getDataLayout(); 4989 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4990 if (hasIrregularType(ScalarTy, DL, VF)) 4991 return false; 4992 4993 return true; 4994 } 4995 4996 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4997 // We should not collect Uniforms more than once per VF. Right now, 4998 // this function is called from collectUniformsAndScalars(), which 4999 // already does this check. Collecting Uniforms for VF=1 does not make any 5000 // sense. 
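// A hedged reminder of the property being computed (illustrative): an
// instruction is uniform after vectorization when one scalar copy per
// unrolled part serves all VF lanes, e.g. the latch compare, or the scalar
// address feeding a consecutive widened load such as a[i] in
//   for (int i = 0; i < n; ++i) s += a[i];
// The code below collects such instructions for the given VF.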
5001 5002 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5003 "This function should not be visited twice for the same VF"); 5004 5005 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5006 // not analyze again. Uniforms.count(VF) will return 1. 5007 Uniforms[VF].clear(); 5008 5009 // We now know that the loop is vectorizable! 5010 // Collect instructions inside the loop that will remain uniform after 5011 // vectorization. 5012 5013 // Global values, params and instructions outside of current loop are out of 5014 // scope. 5015 auto isOutOfScope = [&](Value *V) -> bool { 5016 Instruction *I = dyn_cast<Instruction>(V); 5017 return (!I || !TheLoop->contains(I)); 5018 }; 5019 5020 SetVector<Instruction *> Worklist; 5021 BasicBlock *Latch = TheLoop->getLoopLatch(); 5022 5023 // Instructions that are scalar with predication must not be considered 5024 // uniform after vectorization, because that would create an erroneous 5025 // replicating region where only a single instance out of VF should be formed. 5026 // TODO: optimize such seldom cases if found important, see PR40816. 5027 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5028 if (isScalarWithPredication(I, VF)) { 5029 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5030 << *I << "\n"); 5031 return; 5032 } 5033 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5034 Worklist.insert(I); 5035 }; 5036 5037 // Start with the conditional branch. If the branch condition is an 5038 // instruction contained in the loop that is only used by the branch, it is 5039 // uniform. 5040 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5041 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5042 addToWorklistIfAllowed(Cmp); 5043 5044 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 5045 // are pointers that are treated like consecutive pointers during 5046 // vectorization. The pointer operands of interleaved accesses are an 5047 // example. 5048 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 5049 5050 // Holds pointer operands of instructions that are possibly non-uniform. 5051 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 5052 5053 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5054 InstWidening WideningDecision = getWideningDecision(I, VF); 5055 assert(WideningDecision != CM_Unknown && 5056 "Widening decision should be ready at this moment"); 5057 5058 return (WideningDecision == CM_Widen || 5059 WideningDecision == CM_Widen_Reverse || 5060 WideningDecision == CM_Interleave); 5061 }; 5062 // Iterate over the instructions in the loop, and collect all 5063 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 5064 // that a consecutive-like pointer operand will be scalarized, we collect it 5065 // in PossibleNonUniformPtrs instead. We use two sets here because a single 5066 // getelementptr instruction can be used by both vectorized and scalarized 5067 // memory instructions. For example, if a loop loads and stores from the same 5068 // location, but the store is conditional, the store will be scalarized, and 5069 // the getelementptr won't remain uniform. 5070 for (auto *BB : TheLoop->blocks()) 5071 for (auto &I : *BB) { 5072 // If there's no pointer operand, there's nothing to do. 
5073 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 5074 if (!Ptr) 5075 continue; 5076 5077 // True if all users of Ptr are memory accesses that have Ptr as their 5078 // pointer operand. 5079 auto UsersAreMemAccesses = 5080 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 5081 return getLoadStorePointerOperand(U) == Ptr; 5082 }); 5083 5084 // Ensure the memory instruction will not be scalarized or used by 5085 // gather/scatter, making its pointer operand non-uniform. If the pointer 5086 // operand is used by any instruction other than a memory access, we 5087 // conservatively assume the pointer operand may be non-uniform. 5088 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 5089 PossibleNonUniformPtrs.insert(Ptr); 5090 5091 // If the memory instruction will be vectorized and its pointer operand 5092 // is consecutive-like, or interleaving - the pointer operand should 5093 // remain uniform. 5094 else 5095 ConsecutiveLikePtrs.insert(Ptr); 5096 } 5097 5098 // Add to the Worklist all consecutive and consecutive-like pointers that 5099 // aren't also identified as possibly non-uniform. 5100 for (auto *V : ConsecutiveLikePtrs) 5101 if (!PossibleNonUniformPtrs.count(V)) 5102 addToWorklistIfAllowed(V); 5103 5104 // Expand Worklist in topological order: whenever a new instruction 5105 // is added , its users should be already inside Worklist. It ensures 5106 // a uniform instruction will only be used by uniform instructions. 5107 unsigned idx = 0; 5108 while (idx != Worklist.size()) { 5109 Instruction *I = Worklist[idx++]; 5110 5111 for (auto OV : I->operand_values()) { 5112 // isOutOfScope operands cannot be uniform instructions. 5113 if (isOutOfScope(OV)) 5114 continue; 5115 // First order recurrence Phi's should typically be considered 5116 // non-uniform. 5117 auto *OP = dyn_cast<PHINode>(OV); 5118 if (OP && Legal->isFirstOrderRecurrence(OP)) 5119 continue; 5120 // If all the users of the operand are uniform, then add the 5121 // operand into the uniform worklist. 5122 auto *OI = cast<Instruction>(OV); 5123 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5124 auto *J = cast<Instruction>(U); 5125 return Worklist.count(J) || 5126 (OI == getLoadStorePointerOperand(J) && 5127 isUniformDecision(J, VF)); 5128 })) 5129 addToWorklistIfAllowed(OI); 5130 } 5131 } 5132 5133 // Returns true if Ptr is the pointer operand of a memory access instruction 5134 // I, and I is known to not require scalarization. 5135 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5136 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5137 }; 5138 5139 // For an instruction to be added into Worklist above, all its users inside 5140 // the loop should also be in Worklist. However, this condition cannot be 5141 // true for phi nodes that form a cyclic dependence. We must process phi 5142 // nodes separately. An induction variable will remain uniform if all users 5143 // of the induction variable and induction variable update remain uniform. 5144 // The code below handles both pointer and non-pointer induction variables. 5145 for (auto &Induction : Legal->getInductionVars()) { 5146 auto *Ind = Induction.first; 5147 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5148 5149 // Determine if all users of the induction variable are uniform after 5150 // vectorization. 
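// Hedged example: in
//   for (int i = 0; i < n; ++i) a[i] = b[i];
// the only in-loop users of i are its increment and address computations that
// were already collected as uniform above, so both i and its update remain
// uniform; if i were also stored as a value (a[i] = i), that user would fail
// the checks below and the induction would not be added.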
5151 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5152 auto *I = cast<Instruction>(U); 5153 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5154 isVectorizedMemAccessUse(I, Ind); 5155 }); 5156 if (!UniformInd) 5157 continue; 5158 5159 // Determine if all users of the induction variable update instruction are 5160 // uniform after vectorization. 5161 auto UniformIndUpdate = 5162 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5163 auto *I = cast<Instruction>(U); 5164 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5165 isVectorizedMemAccessUse(I, IndUpdate); 5166 }); 5167 if (!UniformIndUpdate) 5168 continue; 5169 5170 // The induction variable and its update instruction will remain uniform. 5171 addToWorklistIfAllowed(Ind); 5172 addToWorklistIfAllowed(IndUpdate); 5173 } 5174 5175 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5176 } 5177 5178 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5179 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5180 5181 if (Legal->getRuntimePointerChecking()->Need) { 5182 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5183 "runtime pointer checks needed. Enable vectorization of this " 5184 "loop with '#pragma clang loop vectorize(enable)' when " 5185 "compiling with -Os/-Oz", 5186 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5187 return true; 5188 } 5189 5190 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5191 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5192 "runtime SCEV checks needed. Enable vectorization of this " 5193 "loop with '#pragma clang loop vectorize(enable)' when " 5194 "compiling with -Os/-Oz", 5195 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5196 return true; 5197 } 5198 5199 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5200 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5201 reportVectorizationFailure("Runtime stride check for small trip count", 5202 "runtime stride == 1 checks needed. Enable vectorization of " 5203 "this loop without such check by compiling with -Os/-Oz", 5204 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5205 return true; 5206 } 5207 5208 return false; 5209 } 5210 5211 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, 5212 unsigned UserIC) { 5213 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5214 // TODO: It may by useful to do since it's still likely to be dynamically 5215 // uniform if the target can skip. 5216 reportVectorizationFailure( 5217 "Not inserting runtime ptr check for divergent target", 5218 "runtime pointer checks needed. Not enabled for divergent target", 5219 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5220 return None; 5221 } 5222 5223 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5224 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5225 if (TC == 1) { 5226 reportVectorizationFailure("Single iteration (non) loop", 5227 "loop trip count is one, irrelevant for vectorization", 5228 "SingleIterationLoop", ORE, TheLoop); 5229 return None; 5230 } 5231 5232 switch (ScalarEpilogueStatus) { 5233 case CM_ScalarEpilogueAllowed: 5234 return UserVF ? 
UserVF : computeFeasibleMaxVF(TC); 5235 case CM_ScalarEpilogueNotNeededUsePredicate: 5236 LLVM_DEBUG( 5237 dbgs() << "LV: vector predicate hint/switch found.\n" 5238 << "LV: Not allowing scalar epilogue, creating predicated " 5239 << "vector loop.\n"); 5240 break; 5241 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5242 // fallthrough as a special case of OptForSize 5243 case CM_ScalarEpilogueNotAllowedOptSize: 5244 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5245 LLVM_DEBUG( 5246 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5247 else 5248 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5249 << "count.\n"); 5250 5251 // Bail if runtime checks are required, which are not good when optimising 5252 // for size. 5253 if (runtimeChecksRequired()) 5254 return None; 5255 break; 5256 } 5257 5258 // Now try the tail folding 5259 5260 // Invalidate interleave groups that require an epilogue if we can't mask 5261 // the interleave-group. 5262 if (!useMaskedInterleavedAccesses(TTI)) { 5263 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5264 "No decisions should have been taken at this point"); 5265 // Note: There is no need to invalidate any cost modeling decisions here, as 5266 // non where taken so far. 5267 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5268 } 5269 5270 unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC); 5271 assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2"); 5272 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; 5273 if (TC > 0 && TC % MaxVFtimesIC == 0) { 5274 // Accept MaxVF if we do not have a tail. 5275 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5276 return MaxVF; 5277 } 5278 5279 // If we don't know the precise trip count, or if the trip count that we 5280 // found modulo the vectorization factor is not zero, try to fold the tail 5281 // by masking. 5282 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5283 if (Legal->prepareToFoldTailByMasking()) { 5284 FoldTailByMasking = true; 5285 return MaxVF; 5286 } 5287 5288 // If there was a tail-folding hint/switch, but we can't fold the tail by 5289 // masking, fallback to a vectorization with a scalar epilogue. 5290 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5291 if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) { 5292 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5293 return None; 5294 } 5295 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5296 "scalar epilogue instead.\n"); 5297 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5298 return MaxVF; 5299 } 5300 5301 if (TC == 0) { 5302 reportVectorizationFailure( 5303 "Unable to calculate the loop count due to complex control flow", 5304 "unable to calculate the loop count due to complex control flow", 5305 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5306 return None; 5307 } 5308 5309 reportVectorizationFailure( 5310 "Cannot optimize for size and vectorize at the same time.", 5311 "cannot optimize for size and vectorize at the same time. 
" 5312 "Enable vectorization of this loop with '#pragma clang loop " 5313 "vectorize(enable)' when compiling with -Os/-Oz", 5314 "NoTailLoopWithOptForSize", ORE, TheLoop); 5315 return None; 5316 } 5317 5318 unsigned 5319 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5320 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5321 unsigned SmallestType, WidestType; 5322 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5323 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5324 5325 // Get the maximum safe dependence distance in bits computed by LAA. 5326 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5327 // the memory accesses that is most restrictive (involved in the smallest 5328 // dependence distance). 5329 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5330 5331 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5332 5333 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5334 // Note that both WidestRegister and WidestType may not be a powers of 2. 5335 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5336 5337 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5338 << " / " << WidestType << " bits.\n"); 5339 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5340 << WidestRegister << " bits.\n"); 5341 5342 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5343 " into one vector!"); 5344 if (MaxVectorSize == 0) { 5345 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5346 MaxVectorSize = 1; 5347 return MaxVectorSize; 5348 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5349 isPowerOf2_32(ConstTripCount)) { 5350 // We need to clamp the VF to be the ConstTripCount. There is no point in 5351 // choosing a higher viable VF as done in the loop below. 5352 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5353 << ConstTripCount << "\n"); 5354 MaxVectorSize = ConstTripCount; 5355 return MaxVectorSize; 5356 } 5357 5358 unsigned MaxVF = MaxVectorSize; 5359 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5360 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5361 // Collect all viable vectorization factors larger than the default MaxVF 5362 // (i.e. MaxVectorSize). 5363 SmallVector<ElementCount, 8> VFs; 5364 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5365 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5366 VFs.push_back(ElementCount::getFixed(VS)); 5367 5368 // For each VF calculate its register usage. 5369 auto RUs = calculateRegisterUsage(VFs); 5370 5371 // Select the largest VF which doesn't require more registers than existing 5372 // ones. 
5373 for (int i = RUs.size() - 1; i >= 0; --i) { 5374 bool Selected = true; 5375 for (auto& pair : RUs[i].MaxLocalUsers) { 5376 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5377 if (pair.second > TargetNumRegisters) 5378 Selected = false; 5379 } 5380 if (Selected) { 5381 MaxVF = VFs[i].getKnownMinValue(); 5382 break; 5383 } 5384 } 5385 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5386 if (MaxVF < MinVF) { 5387 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5388 << ") with target's minimum: " << MinVF << '\n'); 5389 MaxVF = MinVF; 5390 } 5391 } 5392 } 5393 return MaxVF; 5394 } 5395 5396 VectorizationFactor 5397 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5398 float Cost = expectedCost(ElementCount::getFixed(1)).first; 5399 const float ScalarCost = Cost; 5400 unsigned Width = 1; 5401 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5402 5403 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5404 if (ForceVectorization && MaxVF > 1) { 5405 // Ignore scalar width, because the user explicitly wants vectorization. 5406 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5407 // evaluation. 5408 Cost = std::numeric_limits<float>::max(); 5409 } 5410 5411 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5412 // Notice that the vector loop needs to be executed less times, so 5413 // we need to divide the cost of the vector loops by the width of 5414 // the vector elements. 5415 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5416 float VectorCost = C.first / (float)i; 5417 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5418 << " costs: " << (int)VectorCost << ".\n"); 5419 if (!C.second && !ForceVectorization) { 5420 LLVM_DEBUG( 5421 dbgs() << "LV: Not considering vector loop of width " << i 5422 << " because it will not generate any vector instructions.\n"); 5423 continue; 5424 } 5425 if (VectorCost < Cost) { 5426 Cost = VectorCost; 5427 Width = i; 5428 } 5429 } 5430 5431 if (!EnableCondStoresVectorization && NumPredStores) { 5432 reportVectorizationFailure("There are conditional stores.", 5433 "store that is conditionally executed prevents vectorization", 5434 "ConditionalStore", ORE, TheLoop); 5435 Width = 1; 5436 Cost = ScalarCost; 5437 } 5438 5439 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5440 << "LV: Vectorization seems to be not beneficial, " 5441 << "but was forced by a user.\n"); 5442 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5443 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5444 (unsigned)(Width * Cost)}; 5445 return Factor; 5446 } 5447 5448 std::pair<unsigned, unsigned> 5449 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5450 unsigned MinWidth = -1U; 5451 unsigned MaxWidth = 8; 5452 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5453 5454 // For each block. 5455 for (BasicBlock *BB : TheLoop->blocks()) { 5456 // For each instruction in the loop. 5457 for (Instruction &I : BB->instructionsWithoutDebug()) { 5458 Type *T = I.getType(); 5459 5460 // Skip ignored values. 5461 if (ValuesToIgnore.count(&I)) 5462 continue; 5463 5464 // Only examine Loads, Stores and PHINodes. 5465 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5466 continue; 5467 5468 // Examine PHI nodes that are reduction variables. Update the type to 5469 // account for the recurrence type. 
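// Hedged example: for a narrow reduction such as
//   unsigned char s = 0;
//   for (int i = 0; i < n; ++i) s += a[i];   // a is unsigned char *
// the IR phi may be an i32 after promotion, while the recurrence descriptor
// may record a narrower recurrence type (e.g. i8); using the recurrence type
// here keeps that narrow width from being lost in the smallest/widest-type
// calculation.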
5470 if (auto *PN = dyn_cast<PHINode>(&I)) { 5471 if (!Legal->isReductionVariable(PN)) 5472 continue; 5473 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5474 T = RdxDesc.getRecurrenceType(); 5475 } 5476 5477 // Examine the stored values. 5478 if (auto *ST = dyn_cast<StoreInst>(&I)) 5479 T = ST->getValueOperand()->getType(); 5480 5481 // Ignore loaded pointer types and stored pointer types that are not 5482 // vectorizable. 5483 // 5484 // FIXME: The check here attempts to predict whether a load or store will 5485 // be vectorized. We only know this for certain after a VF has 5486 // been selected. Here, we assume that if an access can be 5487 // vectorized, it will be. We should also look at extending this 5488 // optimization to non-pointer types. 5489 // 5490 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5491 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5492 continue; 5493 5494 MinWidth = std::min(MinWidth, 5495 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5496 MaxWidth = std::max(MaxWidth, 5497 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5498 } 5499 } 5500 5501 return {MinWidth, MaxWidth}; 5502 } 5503 5504 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5505 unsigned LoopCost) { 5506 // -- The interleave heuristics -- 5507 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5508 // There are many micro-architectural considerations that we can't predict 5509 // at this level. For example, frontend pressure (on decode or fetch) due to 5510 // code size, or the number and capabilities of the execution ports. 5511 // 5512 // We use the following heuristics to select the interleave count: 5513 // 1. If the code has reductions, then we interleave to break the cross 5514 // iteration dependency. 5515 // 2. If the loop is really small, then we interleave to reduce the loop 5516 // overhead. 5517 // 3. We don't interleave if we think that we will spill registers to memory 5518 // due to the increased register pressure. 5519 5520 if (!isScalarEpilogueAllowed()) 5521 return 1; 5522 5523 // We used the distance for the interleave count. 5524 if (Legal->getMaxSafeDepDistBytes() != -1U) 5525 return 1; 5526 5527 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5528 const bool HasReductions = !Legal->getReductionVars().empty(); 5529 // Do not interleave loops with a relatively small known or estimated trip 5530 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5531 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5532 // because with the above conditions interleaving can expose ILP and break 5533 // cross iteration dependences for reductions. 5534 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5535 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5536 return 1; 5537 5538 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5539 // We divide by these constants so assume that we have at least one 5540 // instruction that uses at least one register. 5541 for (auto& pair : R.MaxLocalUsers) { 5542 pair.second = std::max(pair.second, 1U); 5543 } 5544 5545 // We calculate the interleave count using the following formula. 5546 // Subtract the number of loop invariants from the number of available 5547 // registers. These registers are used by all of the interleaved instances. 
5548 // Next, divide the remaining registers by the number of registers that is 5549 // required by the loop, in order to estimate how many parallel instances 5550 // fit without causing spills. All of this is rounded down if necessary to be 5551 // a power of two. We want power of two interleave count to simplify any 5552 // addressing operations or alignment considerations. 5553 // We also want power of two interleave counts to ensure that the induction 5554 // variable of the vector loop wraps to zero, when tail is folded by masking; 5555 // this currently happens when OptForSize, in which case IC is set to 1 above. 5556 unsigned IC = UINT_MAX; 5557 5558 for (auto& pair : R.MaxLocalUsers) { 5559 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5560 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5561 << " registers of " 5562 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5563 if (VF.isScalar()) { 5564 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5565 TargetNumRegisters = ForceTargetNumScalarRegs; 5566 } else { 5567 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5568 TargetNumRegisters = ForceTargetNumVectorRegs; 5569 } 5570 unsigned MaxLocalUsers = pair.second; 5571 unsigned LoopInvariantRegs = 0; 5572 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5573 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5574 5575 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5576 // Don't count the induction variable as interleaved. 5577 if (EnableIndVarRegisterHeur) { 5578 TmpIC = 5579 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5580 std::max(1U, (MaxLocalUsers - 1))); 5581 } 5582 5583 IC = std::min(IC, TmpIC); 5584 } 5585 5586 // Clamp the interleave ranges to reasonable counts. 5587 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5588 unsigned MaxInterleaveCount = 5589 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5590 5591 // Check if the user has overridden the max. 5592 if (VF.isScalar()) { 5593 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5594 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5595 } else { 5596 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5597 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5598 } 5599 5600 // If trip count is known or estimated compile time constant, limit the 5601 // interleave count to be less than the trip count divided by VF, provided it 5602 // is at least 1. 5603 if (BestKnownTC) { 5604 MaxInterleaveCount = 5605 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5606 // Make sure MaxInterleaveCount is greater than 0. 5607 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5608 } 5609 5610 assert(MaxInterleaveCount > 0 && 5611 "Maximum interleave count must be greater than 0"); 5612 5613 // Clamp the calculated IC to be between the 1 and the max interleave count 5614 // that the target and trip count allows. 5615 if (IC > MaxInterleaveCount) 5616 IC = MaxInterleaveCount; 5617 else 5618 // Make sure IC is greater than 0. 5619 IC = std::max(1u, IC); 5620 5621 assert(IC > 0 && "Interleave count must be greater than 0."); 5622 5623 // If we did not calculate the cost for VF (because the user selected the VF) 5624 // then we calculate the cost of VF here. 
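  // (expectedCost sums the estimated per-instruction costs over every block
  //  of the loop body for the given VF; for instance a vector body whose
  //  instructions add up to 3 units yields LoopCost == 3, which is what the
  //  small-loop heuristic below compares against the SmallLoopCost
  //  threshold.)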
5625 if (LoopCost == 0) 5626 LoopCost = expectedCost(VF).first; 5627 5628 assert(LoopCost && "Non-zero loop cost expected"); 5629 5630 // Interleave if we vectorized this loop and there is a reduction that could 5631 // benefit from interleaving. 5632 if (VF.isVector() && HasReductions) { 5633 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5634 return IC; 5635 } 5636 5637 // Note that if we've already vectorized the loop we will have done the 5638 // runtime check and so interleaving won't require further checks. 5639 bool InterleavingRequiresRuntimePointerCheck = 5640 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5641 5642 // We want to interleave small loops in order to reduce the loop overhead and 5643 // potentially expose ILP opportunities. 5644 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5645 << "LV: IC is " << IC << '\n' 5646 << "LV: VF is " << VF.getKnownMinValue() << '\n'); 5647 const bool AggressivelyInterleaveReductions = 5648 TTI.enableAggressiveInterleaving(HasReductions); 5649 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5650 // We assume that the cost overhead is 1 and we use the cost model 5651 // to estimate the cost of the loop and interleave until the cost of the 5652 // loop overhead is about 5% of the cost of the loop. 5653 unsigned SmallIC = 5654 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5655 5656 // Interleave until store/load ports (estimated by max interleave count) are 5657 // saturated. 5658 unsigned NumStores = Legal->getNumStores(); 5659 unsigned NumLoads = Legal->getNumLoads(); 5660 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5661 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5662 5663 // If we have a scalar reduction (vector reductions are already dealt with 5664 // by this point), we can increase the critical path length if the loop 5665 // we're interleaving is inside another loop. Limit, by default to 2, so the 5666 // critical path only gets increased by one reduction operation. 5667 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5668 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5669 SmallIC = std::min(SmallIC, F); 5670 StoresIC = std::min(StoresIC, F); 5671 LoadsIC = std::min(LoadsIC, F); 5672 } 5673 5674 if (EnableLoadStoreRuntimeInterleave && 5675 std::max(StoresIC, LoadsIC) > SmallIC) { 5676 LLVM_DEBUG( 5677 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5678 return std::max(StoresIC, LoadsIC); 5679 } 5680 5681 // If there are scalar reductions and TTI has enabled aggressive 5682 // interleaving for reductions, we will interleave to expose ILP. 5683 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5684 AggressivelyInterleaveReductions) { 5685 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5686 // Interleave no less than SmallIC but not as aggressive as the normal IC 5687 // to satisfy the rare situation when resources are too limited. 5688 return std::max(IC / 2, SmallIC); 5689 } else { 5690 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5691 return SmallIC; 5692 } 5693 } 5694 5695 // Interleave if this is a large loop (small loops are already dealt with by 5696 // this point) that could benefit from interleaving. 
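  // To recap the small-loop path above with illustrative numbers: if
  // SmallLoopCost is 20 and LoopCost is 3, SmallIC is clamped to
  // PowerOf2Floor(20 / 3) == 4, and a loop with, say, 2 stores and 1 load may
  // instead return max(StoresIC, LoadsIC) if that saturates the memory ports.
  // Loops that do not take the small-loop path only get interleaved here when
  // the target asks for aggressive interleaving of reductions, as checked
  // below.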
5697 if (AggressivelyInterleaveReductions) { 5698 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5699 return IC; 5700 } 5701 5702 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5703 return 1; 5704 } 5705 5706 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5707 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5708 // This function calculates the register usage by measuring the highest number 5709 // of values that are alive at a single location. Obviously, this is a very 5710 // rough estimation. We scan the loop in a topological order in order and 5711 // assign a number to each instruction. We use RPO to ensure that defs are 5712 // met before their users. We assume that each instruction that has in-loop 5713 // users starts an interval. We record every time that an in-loop value is 5714 // used, so we have a list of the first and last occurrences of each 5715 // instruction. Next, we transpose this data structure into a multi map that 5716 // holds the list of intervals that *end* at a specific location. This multi 5717 // map allows us to perform a linear search. We scan the instructions linearly 5718 // and record each time that a new interval starts, by placing it in a set. 5719 // If we find this value in the multi-map then we remove it from the set. 5720 // The max register usage is the maximum size of the set. 5721 // We also search for instructions that are defined outside the loop, but are 5722 // used inside the loop. We need this number separately from the max-interval 5723 // usage number because when we unroll, loop-invariant values do not take 5724 // more register. 5725 LoopBlocksDFS DFS(TheLoop); 5726 DFS.perform(LI); 5727 5728 RegisterUsage RU; 5729 5730 // Each 'key' in the map opens a new interval. The values 5731 // of the map are the index of the 'last seen' usage of the 5732 // instruction that is the key. 5733 using IntervalMap = DenseMap<Instruction *, unsigned>; 5734 5735 // Maps instruction to its index. 5736 SmallVector<Instruction *, 64> IdxToInstr; 5737 // Marks the end of each interval. 5738 IntervalMap EndPoint; 5739 // Saves the list of instruction indices that are used in the loop. 5740 SmallPtrSet<Instruction *, 8> Ends; 5741 // Saves the list of values that are used in the loop but are 5742 // defined outside the loop, such as arguments and constants. 5743 SmallPtrSet<Value *, 8> LoopInvariants; 5744 5745 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5746 for (Instruction &I : BB->instructionsWithoutDebug()) { 5747 IdxToInstr.push_back(&I); 5748 5749 // Save the end location of each USE. 5750 for (Value *U : I.operands()) { 5751 auto *Instr = dyn_cast<Instruction>(U); 5752 5753 // Ignore non-instruction values such as arguments, constants, etc. 5754 if (!Instr) 5755 continue; 5756 5757 // If this instruction is outside the loop then record it and continue. 5758 if (!TheLoop->contains(Instr)) { 5759 LoopInvariants.insert(Instr); 5760 continue; 5761 } 5762 5763 // Overwrite previous end points. 5764 EndPoint[Instr] = IdxToInstr.size(); 5765 Ends.insert(Instr); 5766 } 5767 } 5768 } 5769 5770 // Saves the list of intervals that end with the index in 'key'. 5771 using InstrList = SmallVector<Instruction *, 2>; 5772 DenseMap<unsigned, InstrList> TransposeEnds; 5773 5774 // Transpose the EndPoints to a list of values that end at each index. 
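  // Schematic example (not real IR): for a body of the form
  //   %a = load ... ; %b = add %a, 1 ; store %b, ...
  // the scan above records EndPoint[%a] == 2 and EndPoint[%b] == 3, so the
  // transposed map built below is { 2 -> {%a}, 3 -> {%b} }. During the linear
  // walk that follows, at most one value is open whenever usage is sampled,
  // so the maximum local usage for that register class is 1.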
5775 for (auto &Interval : EndPoint) 5776 TransposeEnds[Interval.second].push_back(Interval.first); 5777 5778 SmallPtrSet<Instruction *, 8> OpenIntervals; 5779 5780 // Get the size of the widest register. 5781 unsigned MaxSafeDepDist = -1U; 5782 if (Legal->getMaxSafeDepDistBytes() != -1U) 5783 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5784 unsigned WidestRegister = 5785 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5786 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5787 5788 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5789 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5790 5791 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5792 5793 // A lambda that gets the register usage for the given type and VF. 5794 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) { 5795 if (Ty->isTokenTy()) 5796 return 0U; 5797 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5798 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5799 return std::max<unsigned>(1, VF.getKnownMinValue() * TypeSize / 5800 WidestRegister); 5801 }; 5802 5803 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5804 Instruction *I = IdxToInstr[i]; 5805 5806 // Remove all of the instructions that end at this location. 5807 InstrList &List = TransposeEnds[i]; 5808 for (Instruction *ToRemove : List) 5809 OpenIntervals.erase(ToRemove); 5810 5811 // Ignore instructions that are never used within the loop. 5812 if (!Ends.count(I)) 5813 continue; 5814 5815 // Skip ignored values. 5816 if (ValuesToIgnore.count(I)) 5817 continue; 5818 5819 // For each VF find the maximum usage of registers. 5820 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5821 // Count the number of live intervals. 5822 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5823 5824 if (VFs[j].isScalar()) { 5825 for (auto Inst : OpenIntervals) { 5826 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5827 if (RegUsage.find(ClassID) == RegUsage.end()) 5828 RegUsage[ClassID] = 1; 5829 else 5830 RegUsage[ClassID] += 1; 5831 } 5832 } else { 5833 collectUniformsAndScalars(VFs[j]); 5834 for (auto Inst : OpenIntervals) { 5835 // Skip ignored values for VF > 1. 5836 if (VecValuesToIgnore.count(Inst)) 5837 continue; 5838 if (isScalarAfterVectorization(Inst, VFs[j])) { 5839 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5840 if (RegUsage.find(ClassID) == RegUsage.end()) 5841 RegUsage[ClassID] = 1; 5842 else 5843 RegUsage[ClassID] += 1; 5844 } else { 5845 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5846 if (RegUsage.find(ClassID) == RegUsage.end()) 5847 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5848 else 5849 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5850 } 5851 } 5852 } 5853 5854 for (auto& pair : RegUsage) { 5855 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5856 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5857 else 5858 MaxUsages[j][pair.first] = pair.second; 5859 } 5860 } 5861 5862 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5863 << OpenIntervals.size() << '\n'); 5864 5865 // Add the current instruction to the list of open intervals. 
5866 OpenIntervals.insert(I); 5867 } 5868 5869 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5870 SmallMapVector<unsigned, unsigned, 4> Invariant; 5871 5872 for (auto Inst : LoopInvariants) { 5873 unsigned Usage = 5874 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5875 unsigned ClassID = 5876 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 5877 if (Invariant.find(ClassID) == Invariant.end()) 5878 Invariant[ClassID] = Usage; 5879 else 5880 Invariant[ClassID] += Usage; 5881 } 5882 5883 LLVM_DEBUG({ 5884 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5885 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5886 << " item\n"; 5887 for (const auto &pair : MaxUsages[i]) { 5888 dbgs() << "LV(REG): RegisterClass: " 5889 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5890 << " registers\n"; 5891 } 5892 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5893 << " item\n"; 5894 for (const auto &pair : Invariant) { 5895 dbgs() << "LV(REG): RegisterClass: " 5896 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5897 << " registers\n"; 5898 } 5899 }); 5900 5901 RU.LoopInvariantRegs = Invariant; 5902 RU.MaxLocalUsers = MaxUsages[i]; 5903 RUs[i] = RU; 5904 } 5905 5906 return RUs; 5907 } 5908 5909 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5910 // TODO: Cost model for emulated masked load/store is completely 5911 // broken. This hack guides the cost model to use an artificially 5912 // high enough value to practically disable vectorization with such 5913 // operations, except where previously deployed legality hack allowed 5914 // using very low cost values. This is to avoid regressions coming simply 5915 // from moving "masked load/store" check from legality to cost model. 5916 // Masked Load/Gather emulation was previously never allowed. 5917 // Limited number of Masked Store/Scatter emulation was allowed. 5918 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5919 return isa<LoadInst>(I) || 5920 (isa<StoreInst>(I) && 5921 NumPredStores > NumberOfStoresToPredicate); 5922 } 5923 5924 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 5925 // If we aren't vectorizing the loop, or if we've already collected the 5926 // instructions to scalarize, there's nothing to do. Collection may already 5927 // have occurred if we have a user-selected VF and are now computing the 5928 // expected cost for interleaving. 5929 if (VF.isScalar() || VF.isZero() || 5930 InstsToScalarize.find(VF) != InstsToScalarize.end()) 5931 return; 5932 5933 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5934 // not profitable to scalarize any instructions, the presence of VF in the 5935 // map will indicate that we've analyzed it already. 5936 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5937 5938 // Find all the instructions that are scalar with predication in the loop and 5939 // determine if it would be better to not if-convert the blocks they are in. 5940 // If so, we also record the instructions to scalarize. 5941 for (BasicBlock *BB : TheLoop->blocks()) { 5942 if (!blockNeedsPredication(BB)) 5943 continue; 5944 for (Instruction &I : *BB) 5945 if (isScalarWithPredication(&I)) { 5946 ScalarCostsTy ScalarCosts; 5947 // Do not apply discount logic if hacked cost is needed 5948 // for emulated masked memrefs. 
5949 if (!useEmulatedMaskMemRefHack(&I) && 5950 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5951 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5952 // Remember that BB will remain after vectorization. 5953 PredicatedBBsAfterVectorization.insert(BB); 5954 } 5955 } 5956 } 5957 5958 int LoopVectorizationCostModel::computePredInstDiscount( 5959 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5960 ElementCount VF) { 5961 assert(!isUniformAfterVectorization(PredInst, VF) && 5962 "Instruction marked uniform-after-vectorization will be predicated"); 5963 5964 // Initialize the discount to zero, meaning that the scalar version and the 5965 // vector version cost the same. 5966 int Discount = 0; 5967 5968 // Holds instructions to analyze. The instructions we visit are mapped in 5969 // ScalarCosts. Those instructions are the ones that would be scalarized if 5970 // we find that the scalar version costs less. 5971 SmallVector<Instruction *, 8> Worklist; 5972 5973 // Returns true if the given instruction can be scalarized. 5974 auto canBeScalarized = [&](Instruction *I) -> bool { 5975 // We only attempt to scalarize instructions forming a single-use chain 5976 // from the original predicated block that would otherwise be vectorized. 5977 // Although not strictly necessary, we give up on instructions we know will 5978 // already be scalar to avoid traversing chains that are unlikely to be 5979 // beneficial. 5980 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5981 isScalarAfterVectorization(I, VF)) 5982 return false; 5983 5984 // If the instruction is scalar with predication, it will be analyzed 5985 // separately. We ignore it within the context of PredInst. 5986 if (isScalarWithPredication(I)) 5987 return false; 5988 5989 // If any of the instruction's operands are uniform after vectorization, 5990 // the instruction cannot be scalarized. This prevents, for example, a 5991 // masked load from being scalarized. 5992 // 5993 // We assume we will only emit a value for lane zero of an instruction 5994 // marked uniform after vectorization, rather than VF identical values. 5995 // Thus, if we scalarize an instruction that uses a uniform, we would 5996 // create uses of values corresponding to the lanes we aren't emitting code 5997 // for. This behavior can be changed by allowing getScalarValue to clone 5998 // the lane zero values for uniforms rather than asserting. 5999 for (Use &U : I->operands()) 6000 if (auto *J = dyn_cast<Instruction>(U.get())) 6001 if (isUniformAfterVectorization(J, VF)) 6002 return false; 6003 6004 // Otherwise, we can scalarize the instruction. 6005 return true; 6006 }; 6007 6008 // Compute the expected cost discount from scalarizing the entire expression 6009 // feeding the predicated instruction. We currently only consider expressions 6010 // that are single-use instruction chains. 6011 Worklist.push_back(PredInst); 6012 while (!Worklist.empty()) { 6013 Instruction *I = Worklist.pop_back_val(); 6014 6015 // If we've already analyzed the instruction, there's nothing to do. 6016 if (ScalarCosts.find(I) != ScalarCosts.end()) 6017 continue; 6018 6019 // Compute the cost of the vector instruction. Note that this cost already 6020 // includes the scalarization overhead of the predicated instruction. 6021 unsigned VectorCost = getInstructionCost(I, VF).first; 6022 6023 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6024 // the instruction as if it wasn't if-converted and instead remained in the 6025 // predicated block. We will scale this cost by block probability after 6026 // computing the scalarization overhead. 6027 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6028 unsigned ScalarCost = 6029 VF.getKnownMinValue() * 6030 getInstructionCost(I, ElementCount::getFixed(1)).first; 6031 6032 // Compute the scalarization overhead of needed insertelement instructions 6033 // and phi nodes. 6034 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6035 ScalarCost += TTI.getScalarizationOverhead( 6036 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6037 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6038 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6039 ScalarCost += 6040 VF.getKnownMinValue() * 6041 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6042 } 6043 6044 // Compute the scalarization overhead of needed extractelement 6045 // instructions. For each of the instruction's operands, if the operand can 6046 // be scalarized, add it to the worklist; otherwise, account for the 6047 // overhead. 6048 for (Use &U : I->operands()) 6049 if (auto *J = dyn_cast<Instruction>(U.get())) { 6050 assert(VectorType::isValidElementType(J->getType()) && 6051 "Instruction has non-scalar type"); 6052 if (canBeScalarized(J)) 6053 Worklist.push_back(J); 6054 else if (needsExtract(J, VF)) { 6055 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6056 ScalarCost += TTI.getScalarizationOverhead( 6057 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6058 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6059 } 6060 } 6061 6062 // Scale the total scalar cost by block probability. 6063 ScalarCost /= getReciprocalPredBlockProb(); 6064 6065 // Compute the discount. A non-negative discount means the vector version 6066 // of the instruction costs more, and scalarizing would be beneficial. 6067 Discount += VectorCost - ScalarCost; 6068 ScalarCosts[I] = ScalarCost; 6069 } 6070 6071 return Discount; 6072 } 6073 6074 LoopVectorizationCostModel::VectorizationCostTy 6075 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6076 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6077 VectorizationCostTy Cost; 6078 6079 // For each block. 6080 for (BasicBlock *BB : TheLoop->blocks()) { 6081 VectorizationCostTy BlockCost; 6082 6083 // For each instruction in the old loop. 6084 for (Instruction &I : BB->instructionsWithoutDebug()) { 6085 // Skip ignored values. 6086 if (ValuesToIgnore.count(&I) || 6087 (VF.isVector() && VecValuesToIgnore.count(&I))) 6088 continue; 6089 6090 VectorizationCostTy C = getInstructionCost(&I, VF); 6091 6092 // Check if we should override the cost. 6093 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6094 C.first = ForceTargetInstructionCost; 6095 6096 BlockCost.first += C.first; 6097 BlockCost.second |= C.second; 6098 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6099 << " for VF " << VF << " For instruction: " << I 6100 << '\n'); 6101 } 6102 6103 // If we are vectorizing a predicated block, it will have been 6104 // if-converted. This means that the block's instructions (aside from 6105 // stores and instructions that may divide by zero) will now be 6106 // unconditionally executed. For the scalar case, we may not always execute 6107 // the predicated block. Thus, scale the block's cost by the probability of 6108 // executing it. 
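    // For illustration: with getReciprocalPredBlockProb() == 2 (an assumed
    // 50% chance of executing a predicated block), a predicated block whose
    // scalar instruction costs add up to 10 contributes 5 to the VF == 1
    // loop cost, whereas for vector VFs the block is if-converted and is
    // charged in full.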
6109 if (VF.isScalar() && blockNeedsPredication(BB)) 6110 BlockCost.first /= getReciprocalPredBlockProb(); 6111 6112 Cost.first += BlockCost.first; 6113 Cost.second |= BlockCost.second; 6114 } 6115 6116 return Cost; 6117 } 6118 6119 /// Gets Address Access SCEV after verifying that the access pattern 6120 /// is loop invariant except the induction variable dependence. 6121 /// 6122 /// This SCEV can be sent to the Target in order to estimate the address 6123 /// calculation cost. 6124 static const SCEV *getAddressAccessSCEV( 6125 Value *Ptr, 6126 LoopVectorizationLegality *Legal, 6127 PredicatedScalarEvolution &PSE, 6128 const Loop *TheLoop) { 6129 6130 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6131 if (!Gep) 6132 return nullptr; 6133 6134 // We are looking for a gep with all loop invariant indices except for one 6135 // which should be an induction variable. 6136 auto SE = PSE.getSE(); 6137 unsigned NumOperands = Gep->getNumOperands(); 6138 for (unsigned i = 1; i < NumOperands; ++i) { 6139 Value *Opd = Gep->getOperand(i); 6140 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6141 !Legal->isInductionVariable(Opd)) 6142 return nullptr; 6143 } 6144 6145 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6146 return PSE.getSCEV(Ptr); 6147 } 6148 6149 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6150 return Legal->hasStride(I->getOperand(0)) || 6151 Legal->hasStride(I->getOperand(1)); 6152 } 6153 6154 unsigned 6155 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6156 ElementCount VF) { 6157 assert(VF.isVector() && 6158 "Scalarization cost of instruction implies vectorization."); 6159 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6160 Type *ValTy = getMemInstValueType(I); 6161 auto SE = PSE.getSE(); 6162 6163 unsigned AS = getLoadStoreAddressSpace(I); 6164 Value *Ptr = getLoadStorePointerOperand(I); 6165 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6166 6167 // Figure out whether the access is strided and get the stride value 6168 // if it's known in compile time 6169 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6170 6171 // Get the cost of the scalar memory instruction and address computation. 6172 unsigned Cost = 6173 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6174 6175 // Don't pass *I here, since it is scalar but will actually be part of a 6176 // vectorized loop where the user of it is a vectorized instruction. 6177 const Align Alignment = getLoadStoreAlignment(I); 6178 Cost += VF.getKnownMinValue() * 6179 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6180 AS, TTI::TCK_RecipThroughput); 6181 6182 // Get the overhead of the extractelement and insertelement instructions 6183 // we might create due to scalarization. 6184 Cost += getScalarizationOverhead(I, VF); 6185 6186 // If we have a predicated store, it may not be executed for each vector 6187 // lane. Scale the cost by the probability of executing the predicated 6188 // block. 6189 if (isPredicatedInst(I)) { 6190 Cost /= getReciprocalPredBlockProb(); 6191 6192 if (useEmulatedMaskMemRefHack(I)) 6193 // Artificially setting to a high enough value to practically disable 6194 // vectorization with such operations. 
6195 Cost = 3000000; 6196 } 6197 6198 return Cost; 6199 } 6200 6201 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6202 ElementCount VF) { 6203 Type *ValTy = getMemInstValueType(I); 6204 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6205 Value *Ptr = getLoadStorePointerOperand(I); 6206 unsigned AS = getLoadStoreAddressSpace(I); 6207 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6208 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6209 6210 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6211 "Stride should be 1 or -1 for consecutive memory access"); 6212 const Align Alignment = getLoadStoreAlignment(I); 6213 unsigned Cost = 0; 6214 if (Legal->isMaskRequired(I)) 6215 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6216 CostKind); 6217 else 6218 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6219 CostKind, I); 6220 6221 bool Reverse = ConsecutiveStride < 0; 6222 if (Reverse) 6223 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6224 return Cost; 6225 } 6226 6227 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6228 ElementCount VF) { 6229 Type *ValTy = getMemInstValueType(I); 6230 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6231 const Align Alignment = getLoadStoreAlignment(I); 6232 unsigned AS = getLoadStoreAddressSpace(I); 6233 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6234 if (isa<LoadInst>(I)) { 6235 return TTI.getAddressComputationCost(ValTy) + 6236 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6237 CostKind) + 6238 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6239 } 6240 StoreInst *SI = cast<StoreInst>(I); 6241 6242 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6243 return TTI.getAddressComputationCost(ValTy) + 6244 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6245 CostKind) + 6246 (isLoopInvariantStoreValue 6247 ? 0 6248 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6249 VF.getKnownMinValue() - 1)); 6250 } 6251 6252 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6253 ElementCount VF) { 6254 Type *ValTy = getMemInstValueType(I); 6255 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6256 const Align Alignment = getLoadStoreAlignment(I); 6257 const Value *Ptr = getLoadStorePointerOperand(I); 6258 6259 return TTI.getAddressComputationCost(VectorTy) + 6260 TTI.getGatherScatterOpCost( 6261 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6262 TargetTransformInfo::TCK_RecipThroughput, I); 6263 } 6264 6265 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6266 ElementCount VF) { 6267 Type *ValTy = getMemInstValueType(I); 6268 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6269 unsigned AS = getLoadStoreAddressSpace(I); 6270 6271 auto Group = getInterleavedAccessGroup(I); 6272 assert(Group && "Fail to get an interleaved access group."); 6273 6274 unsigned InterleaveFactor = Group->getFactor(); 6275 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6276 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6277 6278 // Holds the indices of existing members in an interleaved load group. 6279 // An interleaved store group doesn't need this as it doesn't allow gaps. 
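  // For instance, a factor-4 load group with members only at positions 0 and
  // 2 yields Indices == {0, 2}; the missing positions are gaps that the
  // target hook below can charge for, possibly requiring a mask for the gaps
  // (see UseMaskForGaps).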
6280 SmallVector<unsigned, 4> Indices; 6281 if (isa<LoadInst>(I)) { 6282 for (unsigned i = 0; i < InterleaveFactor; i++) 6283 if (Group->getMember(i)) 6284 Indices.push_back(i); 6285 } 6286 6287 // Calculate the cost of the whole interleaved group. 6288 bool UseMaskForGaps = 6289 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6290 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6291 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6292 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6293 6294 if (Group->isReverse()) { 6295 // TODO: Add support for reversed masked interleaved access. 6296 assert(!Legal->isMaskRequired(I) && 6297 "Reverse masked interleaved access not supported."); 6298 Cost += Group->getNumMembers() * 6299 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6300 } 6301 return Cost; 6302 } 6303 6304 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6305 ElementCount VF) { 6306 // Calculate scalar cost only. Vectorization cost should be ready at this 6307 // moment. 6308 if (VF.isScalar()) { 6309 Type *ValTy = getMemInstValueType(I); 6310 const Align Alignment = getLoadStoreAlignment(I); 6311 unsigned AS = getLoadStoreAddressSpace(I); 6312 6313 return TTI.getAddressComputationCost(ValTy) + 6314 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6315 TTI::TCK_RecipThroughput, I); 6316 } 6317 return getWideningCost(I, VF); 6318 } 6319 6320 LoopVectorizationCostModel::VectorizationCostTy 6321 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6322 ElementCount VF) { 6323 assert(!VF.isScalable() && 6324 "the cost model is not yet implemented for scalable vectorization"); 6325 // If we know that this instruction will remain uniform, check the cost of 6326 // the scalar version. 6327 if (isUniformAfterVectorization(I, VF)) 6328 VF = ElementCount::getFixed(1); 6329 6330 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6331 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6332 6333 // Forced scalars do not have any scalarization overhead. 6334 auto ForcedScalar = ForcedScalars.find(VF); 6335 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6336 auto InstSet = ForcedScalar->second; 6337 if (InstSet.count(I)) 6338 return VectorizationCostTy( 6339 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6340 VF.getKnownMinValue()), 6341 false); 6342 } 6343 6344 Type *VectorTy; 6345 unsigned C = getInstructionCost(I, VF, VectorTy); 6346 6347 bool TypeNotScalarized = 6348 VF.isVector() && VectorTy->isVectorTy() && 6349 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6350 return VectorizationCostTy(C, TypeNotScalarized); 6351 } 6352 6353 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6354 ElementCount VF) { 6355 6356 assert(!VF.isScalable() && 6357 "cannot compute scalarization overhead for scalable vectorization"); 6358 if (VF.isScalar()) 6359 return 0; 6360 6361 unsigned Cost = 0; 6362 Type *RetTy = ToVectorTy(I->getType(), VF); 6363 if (!RetTy->isVoidTy() && 6364 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6365 Cost += TTI.getScalarizationOverhead( 6366 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 6367 true, false); 6368 6369 // Some targets keep addresses scalar. 6370 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6371 return Cost; 6372 6373 // Some targets support efficient element stores. 
6374 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6375 return Cost; 6376 6377 // Collect operands to consider. 6378 CallInst *CI = dyn_cast<CallInst>(I); 6379 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6380 6381 // Skip operands that do not require extraction/scalarization and do not incur 6382 // any overhead. 6383 return Cost + TTI.getOperandsScalarizationOverhead( 6384 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 6385 } 6386 6387 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6388 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6389 if (VF.isScalar()) 6390 return; 6391 NumPredStores = 0; 6392 for (BasicBlock *BB : TheLoop->blocks()) { 6393 // For each instruction in the old loop. 6394 for (Instruction &I : *BB) { 6395 Value *Ptr = getLoadStorePointerOperand(&I); 6396 if (!Ptr) 6397 continue; 6398 6399 // TODO: We should generate better code and update the cost model for 6400 // predicated uniform stores. Today they are treated as any other 6401 // predicated store (see added test cases in 6402 // invariant-store-vectorization.ll). 6403 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6404 NumPredStores++; 6405 6406 if (Legal->isUniform(Ptr) && 6407 // Conditional loads and stores should be scalarized and predicated. 6408 // isScalarWithPredication cannot be used here since masked 6409 // gather/scatters are not considered scalar with predication. 6410 !Legal->blockNeedsPredication(I.getParent())) { 6411 // TODO: Avoid replicating loads and stores instead of 6412 // relying on instcombine to remove them. 6413 // Load: Scalar load + broadcast 6414 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6415 unsigned Cost = getUniformMemOpCost(&I, VF); 6416 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6417 continue; 6418 } 6419 6420 // We assume that widening is the best solution when possible. 6421 if (memoryInstructionCanBeWidened(&I, VF)) { 6422 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6423 int ConsecutiveStride = 6424 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6425 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6426 "Expected consecutive stride."); 6427 InstWidening Decision = 6428 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6429 setWideningDecision(&I, VF, Decision, Cost); 6430 continue; 6431 } 6432 6433 // Choose between Interleaving, Gather/Scatter or Scalarization. 6434 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6435 unsigned NumAccesses = 1; 6436 if (isAccessInterleaved(&I)) { 6437 auto Group = getInterleavedAccessGroup(&I); 6438 assert(Group && "Fail to get an interleaved access group."); 6439 6440 // Make one decision for the whole group. 6441 if (getWideningDecision(&I, VF) != CM_Unknown) 6442 continue; 6443 6444 NumAccesses = Group->getNumMembers(); 6445 if (interleavedAccessCanBeWidened(&I, VF)) 6446 InterleaveCost = getInterleaveGroupCost(&I, VF); 6447 } 6448 6449 unsigned GatherScatterCost = 6450 isLegalGatherOrScatter(&I) 6451 ? getGatherScatterCost(&I, VF) * NumAccesses 6452 : std::numeric_limits<unsigned>::max(); 6453 6454 unsigned ScalarizationCost = 6455 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6456 6457 // Choose better solution for the current VF, 6458 // write down this decision and use it during vectorization. 
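      // For example, with InterleaveCost == 6, GatherScatterCost == 8 and
      // ScalarizationCost == 10 the group is interleaved. Interleaving wins a
      // tie with gather/scatter (<=), whereas a cost that merely ties the
      // scalarization cost falls through to scalarization (strict <).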
6459 unsigned Cost; 6460 InstWidening Decision; 6461 if (InterleaveCost <= GatherScatterCost && 6462 InterleaveCost < ScalarizationCost) { 6463 Decision = CM_Interleave; 6464 Cost = InterleaveCost; 6465 } else if (GatherScatterCost < ScalarizationCost) { 6466 Decision = CM_GatherScatter; 6467 Cost = GatherScatterCost; 6468 } else { 6469 Decision = CM_Scalarize; 6470 Cost = ScalarizationCost; 6471 } 6472 // If the instructions belongs to an interleave group, the whole group 6473 // receives the same decision. The whole group receives the cost, but 6474 // the cost will actually be assigned to one instruction. 6475 if (auto Group = getInterleavedAccessGroup(&I)) 6476 setWideningDecision(Group, VF, Decision, Cost); 6477 else 6478 setWideningDecision(&I, VF, Decision, Cost); 6479 } 6480 } 6481 6482 // Make sure that any load of address and any other address computation 6483 // remains scalar unless there is gather/scatter support. This avoids 6484 // inevitable extracts into address registers, and also has the benefit of 6485 // activating LSR more, since that pass can't optimize vectorized 6486 // addresses. 6487 if (TTI.prefersVectorizedAddressing()) 6488 return; 6489 6490 // Start with all scalar pointer uses. 6491 SmallPtrSet<Instruction *, 8> AddrDefs; 6492 for (BasicBlock *BB : TheLoop->blocks()) 6493 for (Instruction &I : *BB) { 6494 Instruction *PtrDef = 6495 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6496 if (PtrDef && TheLoop->contains(PtrDef) && 6497 getWideningDecision(&I, VF) != CM_GatherScatter) 6498 AddrDefs.insert(PtrDef); 6499 } 6500 6501 // Add all instructions used to generate the addresses. 6502 SmallVector<Instruction *, 4> Worklist; 6503 for (auto *I : AddrDefs) 6504 Worklist.push_back(I); 6505 while (!Worklist.empty()) { 6506 Instruction *I = Worklist.pop_back_val(); 6507 for (auto &Op : I->operands()) 6508 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6509 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6510 AddrDefs.insert(InstOp).second) 6511 Worklist.push_back(InstOp); 6512 } 6513 6514 for (auto *I : AddrDefs) { 6515 if (isa<LoadInst>(I)) { 6516 // Setting the desired widening decision should ideally be handled in 6517 // by cost functions, but since this involves the task of finding out 6518 // if the loaded register is involved in an address computation, it is 6519 // instead changed here when we know this is the case. 6520 InstWidening Decision = getWideningDecision(I, VF); 6521 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6522 // Scalarize a widened load of address. 6523 setWideningDecision( 6524 I, VF, CM_Scalarize, 6525 (VF.getKnownMinValue() * 6526 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6527 else if (auto Group = getInterleavedAccessGroup(I)) { 6528 // Scalarize an interleave group of address loads. 6529 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6530 if (Instruction *Member = Group->getMember(I)) 6531 setWideningDecision( 6532 Member, VF, CM_Scalarize, 6533 (VF.getKnownMinValue() * 6534 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6535 } 6536 } 6537 } else 6538 // Make sure I gets scalarized and a cost estimate without 6539 // scalarization overhead. 
6540 ForcedScalars[VF].insert(I); 6541 } 6542 } 6543 6544 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6545 ElementCount VF, 6546 Type *&VectorTy) { 6547 Type *RetTy = I->getType(); 6548 if (canTruncateToMinimalBitwidth(I, VF)) 6549 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6550 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6551 auto SE = PSE.getSE(); 6552 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6553 6554 // TODO: We need to estimate the cost of intrinsic calls. 6555 switch (I->getOpcode()) { 6556 case Instruction::GetElementPtr: 6557 // We mark this instruction as zero-cost because the cost of GEPs in 6558 // vectorized code depends on whether the corresponding memory instruction 6559 // is scalarized or not. Therefore, we handle GEPs with the memory 6560 // instruction cost. 6561 return 0; 6562 case Instruction::Br: { 6563 // In cases of scalarized and predicated instructions, there will be VF 6564 // predicated blocks in the vectorized loop. Each branch around these 6565 // blocks requires also an extract of its vector compare i1 element. 6566 bool ScalarPredicatedBB = false; 6567 BranchInst *BI = cast<BranchInst>(I); 6568 if (VF.isVector() && BI->isConditional() && 6569 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6570 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6571 ScalarPredicatedBB = true; 6572 6573 if (ScalarPredicatedBB) { 6574 // Return cost for branches around scalarized and predicated blocks. 6575 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6576 auto *Vec_i1Ty = 6577 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6578 return (TTI.getScalarizationOverhead( 6579 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6580 false, true) + 6581 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 6582 VF.getKnownMinValue())); 6583 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6584 // The back-edge branch will remain, as will all scalar branches. 6585 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6586 else 6587 // This branch will be eliminated by if-conversion. 6588 return 0; 6589 // Note: We currently assume zero cost for an unconditional branch inside 6590 // a predicated block since it will become a fall-through, although we 6591 // may decide in the future to call TTI for all branches. 6592 } 6593 case Instruction::PHI: { 6594 auto *Phi = cast<PHINode>(I); 6595 6596 // First-order recurrences are replaced by vector shuffles inside the loop. 6597 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6598 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 6599 return TTI.getShuffleCost( 6600 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 6601 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 6602 6603 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6604 // converted into select instructions. We require N - 1 selects per phi 6605 // node, where N is the number of incoming values. 
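    // For example, an if-converted phi with three incoming values becomes two
    // selects, so its cost is twice the cost of one vector select at this VF.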
6606 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 6607 return (Phi->getNumIncomingValues() - 1) * 6608 TTI.getCmpSelInstrCost( 6609 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6610 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6611 CostKind); 6612 6613 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6614 } 6615 case Instruction::UDiv: 6616 case Instruction::SDiv: 6617 case Instruction::URem: 6618 case Instruction::SRem: 6619 // If we have a predicated instruction, it may not be executed for each 6620 // vector lane. Get the scalarization cost and scale this amount by the 6621 // probability of executing the predicated block. If the instruction is not 6622 // predicated, we fall through to the next case. 6623 if (VF.isVector() && isScalarWithPredication(I)) { 6624 unsigned Cost = 0; 6625 6626 // These instructions have a non-void type, so account for the phi nodes 6627 // that we will create. This cost is likely to be zero. The phi node 6628 // cost, if any, should be scaled by the block probability because it 6629 // models a copy at the end of each predicated block. 6630 Cost += VF.getKnownMinValue() * 6631 TTI.getCFInstrCost(Instruction::PHI, CostKind); 6632 6633 // The cost of the non-predicated instruction. 6634 Cost += VF.getKnownMinValue() * 6635 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6636 6637 // The cost of insertelement and extractelement instructions needed for 6638 // scalarization. 6639 Cost += getScalarizationOverhead(I, VF); 6640 6641 // Scale the cost by the probability of executing the predicated blocks. 6642 // This assumes the predicated block for each vector lane is equally 6643 // likely. 6644 return Cost / getReciprocalPredBlockProb(); 6645 } 6646 LLVM_FALLTHROUGH; 6647 case Instruction::Add: 6648 case Instruction::FAdd: 6649 case Instruction::Sub: 6650 case Instruction::FSub: 6651 case Instruction::Mul: 6652 case Instruction::FMul: 6653 case Instruction::FDiv: 6654 case Instruction::FRem: 6655 case Instruction::Shl: 6656 case Instruction::LShr: 6657 case Instruction::AShr: 6658 case Instruction::And: 6659 case Instruction::Or: 6660 case Instruction::Xor: { 6661 // Since we will replace the stride by 1 the multiplication should go away. 6662 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6663 return 0; 6664 // Certain instructions can be cheaper to vectorize if they have a constant 6665 // second vector operand. One example of this are shifts on x86. 6666 Value *Op2 = I->getOperand(1); 6667 TargetTransformInfo::OperandValueProperties Op2VP; 6668 TargetTransformInfo::OperandValueKind Op2VK = 6669 TTI.getOperandInfo(Op2, Op2VP); 6670 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6671 Op2VK = TargetTransformInfo::OK_UniformValue; 6672 6673 SmallVector<const Value *, 4> Operands(I->operand_values()); 6674 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 6675 return N * TTI.getArithmeticInstrCost( 6676 I->getOpcode(), VectorTy, CostKind, 6677 TargetTransformInfo::OK_AnyValue, 6678 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6679 } 6680 case Instruction::FNeg: { 6681 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 6682 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 6683 return N * TTI.getArithmeticInstrCost( 6684 I->getOpcode(), VectorTy, CostKind, 6685 TargetTransformInfo::OK_AnyValue, 6686 TargetTransformInfo::OK_AnyValue, 6687 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6688 I->getOperand(0), I); 6689 } 6690 case Instruction::Select: { 6691 SelectInst *SI = cast<SelectInst>(I); 6692 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6693 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6694 Type *CondTy = SI->getCondition()->getType(); 6695 if (!ScalarCond) { 6696 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 6697 CondTy = VectorType::get(CondTy, VF); 6698 } 6699 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 6700 CostKind, I); 6701 } 6702 case Instruction::ICmp: 6703 case Instruction::FCmp: { 6704 Type *ValTy = I->getOperand(0)->getType(); 6705 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6706 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6707 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6708 VectorTy = ToVectorTy(ValTy, VF); 6709 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, 6710 I); 6711 } 6712 case Instruction::Store: 6713 case Instruction::Load: { 6714 ElementCount Width = VF; 6715 if (Width.isVector()) { 6716 InstWidening Decision = getWideningDecision(I, Width); 6717 assert(Decision != CM_Unknown && 6718 "CM decision should be taken at this point"); 6719 if (Decision == CM_Scalarize) 6720 Width = ElementCount::getFixed(1); 6721 } 6722 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6723 return getMemoryInstructionCost(I, VF); 6724 } 6725 case Instruction::ZExt: 6726 case Instruction::SExt: 6727 case Instruction::FPToUI: 6728 case Instruction::FPToSI: 6729 case Instruction::FPExt: 6730 case Instruction::PtrToInt: 6731 case Instruction::IntToPtr: 6732 case Instruction::SIToFP: 6733 case Instruction::UIToFP: 6734 case Instruction::Trunc: 6735 case Instruction::FPTrunc: 6736 case Instruction::BitCast: { 6737 // Computes the CastContextHint from a Load/Store instruction. 6738 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 6739 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6740 "Expected a load or a store!"); 6741 6742 if (VF.isScalar() || !TheLoop->contains(I)) 6743 return TTI::CastContextHint::Normal; 6744 6745 switch (getWideningDecision(I, VF)) { 6746 case LoopVectorizationCostModel::CM_GatherScatter: 6747 return TTI::CastContextHint::GatherScatter; 6748 case LoopVectorizationCostModel::CM_Interleave: 6749 return TTI::CastContextHint::Interleave; 6750 case LoopVectorizationCostModel::CM_Scalarize: 6751 case LoopVectorizationCostModel::CM_Widen: 6752 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 6753 : TTI::CastContextHint::Normal; 6754 case LoopVectorizationCostModel::CM_Widen_Reverse: 6755 return TTI::CastContextHint::Reversed; 6756 case LoopVectorizationCostModel::CM_Unknown: 6757 llvm_unreachable("Instr did not go through cost modelling?"); 6758 } 6759 6760 llvm_unreachable("Unhandled case!"); 6761 }; 6762 6763 unsigned Opcode = I->getOpcode(); 6764 TTI::CastContextHint CCH = TTI::CastContextHint::None; 6765 // For Trunc, the context is the only user, which must be a StoreInst. 
6766 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 6767 if (I->hasOneUse()) 6768 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 6769 CCH = ComputeCCH(Store); 6770 } 6771 // For Z/Sext, the context is the operand, which must be a LoadInst. 6772 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 6773 Opcode == Instruction::FPExt) { 6774 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 6775 CCH = ComputeCCH(Load); 6776 } 6777 6778 // We optimize the truncation of induction variables having constant 6779 // integer steps. The cost of these truncations is the same as the scalar 6780 // operation. 6781 if (isOptimizableIVTruncate(I, VF)) { 6782 auto *Trunc = cast<TruncInst>(I); 6783 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6784 Trunc->getSrcTy(), CCH, CostKind, Trunc); 6785 } 6786 6787 Type *SrcScalarTy = I->getOperand(0)->getType(); 6788 Type *SrcVecTy = 6789 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6790 if (canTruncateToMinimalBitwidth(I, VF)) { 6791 // This cast is going to be shrunk. This may remove the cast or it might 6792 // turn it into slightly different cast. For example, if MinBW == 16, 6793 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6794 // 6795 // Calculate the modified src and dest types. 6796 Type *MinVecTy = VectorTy; 6797 if (Opcode == Instruction::Trunc) { 6798 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6799 VectorTy = 6800 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6801 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 6802 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6803 VectorTy = 6804 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6805 } 6806 } 6807 6808 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 6809 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 6810 return N * 6811 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 6812 } 6813 case Instruction::Call: { 6814 bool NeedToScalarize; 6815 CallInst *CI = cast<CallInst>(I); 6816 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6817 if (getVectorIntrinsicIDForCall(CI, TLI)) 6818 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6819 return CallCost; 6820 } 6821 default: 6822 // The cost of executing VF copies of the scalar instruction. This opcode 6823 // is unknown. Assume that it is the same as 'mul'. 6824 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 6825 Instruction::Mul, VectorTy, CostKind) + 6826 getScalarizationOverhead(I, VF); 6827 } // end of switch. 
6828 } 6829 6830 char LoopVectorize::ID = 0; 6831 6832 static const char lv_name[] = "Loop Vectorization"; 6833 6834 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6835 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6836 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6837 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6838 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6839 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6840 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6841 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6842 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6843 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6844 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6845 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6846 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6847 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6848 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6849 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6850 6851 namespace llvm { 6852 6853 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6854 6855 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6856 bool VectorizeOnlyWhenForced) { 6857 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6858 } 6859 6860 } // end namespace llvm 6861 6862 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6863 // Check if the pointer operand of a load or store instruction is 6864 // consecutive. 6865 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6866 return Legal->isConsecutivePtr(Ptr); 6867 return false; 6868 } 6869 6870 void LoopVectorizationCostModel::collectValuesToIgnore() { 6871 // Ignore ephemeral values. 6872 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6873 6874 // Ignore type-promoting instructions we identified during reduction 6875 // detection. 6876 for (auto &Reduction : Legal->getReductionVars()) { 6877 RecurrenceDescriptor &RedDes = Reduction.second; 6878 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6879 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6880 } 6881 // Ignore type-casting instructions we identified during induction 6882 // detection. 6883 for (auto &Induction : Legal->getInductionVars()) { 6884 InductionDescriptor &IndDes = Induction.second; 6885 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6886 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6887 } 6888 } 6889 6890 void LoopVectorizationCostModel::collectInLoopReductions() { 6891 for (auto &Reduction : Legal->getReductionVars()) { 6892 PHINode *Phi = Reduction.first; 6893 RecurrenceDescriptor &RdxDesc = Reduction.second; 6894 6895 // We don't collect reductions that are type promoted (yet). 6896 if (RdxDesc.getRecurrenceType() != Phi->getType()) 6897 continue; 6898 6899 // If the target would prefer this reduction to happen "in-loop", then we 6900 // want to record it as such. 6901 unsigned Opcode = RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()); 6902 if (!PreferInLoopReductions && 6903 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 6904 TargetTransformInfo::ReductionFlags())) 6905 continue; 6906 6907 // Check that we can correctly put the reductions into the loop, by 6908 // finding the chain of operations that leads from the phi to the loop 6909 // exit value. 
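    // For a plain integer sum, e.g. %sum.next = add i32 %sum.phi, %x, the
    // chain is just that single add. If no well-formed chain can be found,
    // getReductionOpChain returns an empty vector and the reduction is left
    // as an ordinary out-of-loop reduction.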
6910 SmallVector<Instruction *, 4> ReductionOperations = 6911 RdxDesc.getReductionOpChain(Phi, TheLoop); 6912 bool InLoop = !ReductionOperations.empty(); 6913 if (InLoop) 6914 InLoopReductionChains[Phi] = ReductionOperations; 6915 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 6916 << " reduction for phi: " << *Phi << "\n"); 6917 } 6918 } 6919 6920 // TODO: we could return a pair of values that specify the max VF and 6921 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6922 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 6923 // doesn't have a cost model that can choose which plan to execute if 6924 // more than one is generated. 6925 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6926 LoopVectorizationCostModel &CM) { 6927 unsigned WidestType; 6928 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6929 return WidestVectorRegBits / WidestType; 6930 } 6931 6932 VectorizationFactor 6933 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 6934 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 6935 ElementCount VF = UserVF; 6936 // Outer loop handling: They may require CFG and instruction level 6937 // transformations before even evaluating whether vectorization is profitable. 6938 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6939 // the vectorization pipeline. 6940 if (!OrigLoop->isInnermost()) { 6941 // If the user doesn't provide a vectorization factor, determine a 6942 // reasonable one. 6943 if (UserVF.isZero()) { 6944 VF = ElementCount::getFixed( 6945 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 6946 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6947 6948 // Make sure we have a VF > 1 for stress testing. 6949 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 6950 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6951 << "overriding computed VF.\n"); 6952 VF = ElementCount::getFixed(4); 6953 } 6954 } 6955 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6956 assert(isPowerOf2_32(VF.getKnownMinValue()) && 6957 "VF needs to be a power of two"); 6958 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 6959 << "VF " << VF << " to build VPlans.\n"); 6960 buildVPlans(VF.getKnownMinValue(), VF.getKnownMinValue()); 6961 6962 // For VPlan build stress testing, we bail out after VPlan construction. 6963 if (VPlanBuildStressTest) 6964 return VectorizationFactor::Disabled(); 6965 6966 return {VF, 0 /*Cost*/}; 6967 } 6968 6969 LLVM_DEBUG( 6970 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6971 "VPlan-native path.\n"); 6972 return VectorizationFactor::Disabled(); 6973 } 6974 6975 Optional<VectorizationFactor> 6976 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 6977 assert(!UserVF.isScalable() && "scalable vectorization not yet handled"); 6978 assert(OrigLoop->isInnermost() && "Inner loop expected."); 6979 Optional<unsigned> MaybeMaxVF = 6980 CM.computeMaxVF(UserVF.getKnownMinValue(), UserIC); 6981 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6982 return None; 6983 6984 // Invalidate interleave groups if all blocks of loop will be predicated. 
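// When the tail is folded by masking, every block (including the header) is
// predicated, so each member access of an interleave group would need a mask;
// without masked-interleaved support the groups cannot be kept.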
6985 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6986 !useMaskedInterleavedAccesses(*TTI)) { 6987 LLVM_DEBUG( 6988 dbgs() 6989 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6990 "which requires masked-interleaved support.\n"); 6991 if (CM.InterleaveInfo.invalidateGroups()) 6992 // Invalidating interleave groups also requires invalidating all decisions 6993 // based on them, which includes widening decisions and uniform and scalar 6994 // values. 6995 CM.invalidateCostModelingDecisions(); 6996 } 6997 6998 if (!UserVF.isZero()) { 6999 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7000 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7001 "VF needs to be a power of two"); 7002 // Collect the instructions (and their associated costs) that will be more 7003 // profitable to scalarize. 7004 CM.selectUserVectorizationFactor(UserVF); 7005 CM.collectInLoopReductions(); 7006 buildVPlansWithVPRecipes(UserVF.getKnownMinValue(), 7007 UserVF.getKnownMinValue()); 7008 LLVM_DEBUG(printPlans(dbgs())); 7009 return {{UserVF, 0}}; 7010 } 7011 7012 unsigned MaxVF = MaybeMaxVF.getValue(); 7013 assert(MaxVF != 0 && "MaxVF is zero."); 7014 7015 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 7016 // Collect Uniform and Scalar instructions after vectorization with VF. 7017 CM.collectUniformsAndScalars(ElementCount::getFixed(VF)); 7018 7019 // Collect the instructions (and their associated costs) that will be more 7020 // profitable to scalarize. 7021 if (VF > 1) 7022 CM.collectInstsToScalarize(ElementCount::getFixed(VF)); 7023 } 7024 7025 CM.collectInLoopReductions(); 7026 7027 buildVPlansWithVPRecipes(1, MaxVF); 7028 LLVM_DEBUG(printPlans(dbgs())); 7029 if (MaxVF == 1) 7030 return VectorizationFactor::Disabled(); 7031 7032 // Select the optimal vectorization factor. 7033 return CM.selectVectorizationFactor(MaxVF); 7034 } 7035 7036 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7037 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7038 << '\n'); 7039 BestVF = VF; 7040 BestUF = UF; 7041 7042 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7043 return !Plan->hasVF(VF); 7044 }); 7045 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7046 } 7047 7048 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7049 DominatorTree *DT) { 7050 // Perform the actual loop transformation. 7051 7052 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7053 VPCallbackILV CallbackILV(ILV); 7054 7055 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7056 7057 VPTransformState State{*BestVF, BestUF, LI, 7058 DT, ILV.Builder, ILV.VectorLoopValueMap, 7059 &ILV, CallbackILV}; 7060 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7061 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7062 State.CanonicalIV = ILV.Induction; 7063 7064 //===------------------------------------------------===// 7065 // 7066 // Notice: any optimization or new instruction that go 7067 // into the code below should also be implemented in 7068 // the cost-model. 7069 // 7070 //===------------------------------------------------===// 7071 7072 // 2. Copy and widen instructions from the old loop into the new loop. 7073 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7074 VPlans.front()->execute(&State); 7075 7076 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7077 // predication, updating analyses. 
7078 ILV.fixVectorizedLoop(); 7079 } 7080 7081 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7082 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7083 BasicBlock *Latch = OrigLoop->getLoopLatch(); 7084 7085 // We create new control-flow for the vectorized loop, so the original 7086 // condition will be dead after vectorization if it's only used by the 7087 // branch. 7088 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 7089 if (Cmp && Cmp->hasOneUse()) { 7090 DeadInstructions.insert(Cmp); 7091 7092 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7093 for (Value *Op : Cmp->operands()) { 7094 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7095 DeadInstructions.insert(cast<Instruction>(Op)); 7096 } 7097 } 7098 7099 // We create new "steps" for induction variable updates to which the original 7100 // induction variables map. An original update instruction will be dead if 7101 // all its users except the induction variable are dead. 7102 for (auto &Induction : Legal->getInductionVars()) { 7103 PHINode *Ind = Induction.first; 7104 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7105 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7106 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7107 })) 7108 DeadInstructions.insert(IndUpdate); 7109 7110 // We record as "Dead" also the type-casting instructions we had identified 7111 // during induction analysis. We don't need any handling for them in the 7112 // vectorized loop because we have proven that, under a proper runtime 7113 // test guarding the vectorized loop, the value of the phi, and the casted 7114 // value of the phi, are the same. The last instruction in this casting chain 7115 // will get its scalar/vector/widened def from the scalar/vector/widened def 7116 // of the respective phi node. Any other casts in the induction def-use chain 7117 // have no other uses outside the phi update chain, and will be ignored. 7118 InductionDescriptor &IndDes = Induction.second; 7119 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7120 DeadInstructions.insert(Casts.begin(), Casts.end()); 7121 } 7122 } 7123 7124 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7125 7126 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7127 7128 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7129 Instruction::BinaryOps BinOp) { 7130 // When unrolling and the VF is 1, we only need to add a simple scalar. 7131 Type *Ty = Val->getType(); 7132 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7133 7134 if (Ty->isFloatingPointTy()) { 7135 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7136 7137 // Floating point operations had to be 'fast' to enable the unrolling. 7138 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7139 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7140 } 7141 Constant *C = ConstantInt::get(Ty, StartIdx); 7142 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7143 } 7144 7145 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7146 SmallVector<Metadata *, 4> MDs; 7147 // Reserve first location for self reference to the LoopID metadata node. 7148 MDs.push_back(nullptr); 7149 bool IsUnrollMetadata = false; 7150 MDNode *LoopID = L->getLoopID(); 7151 if (LoopID) { 7152 // First find existing loop unrolling disable metadata. 
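// An existing disable entry would look roughly like (illustrative):
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.disable"}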
7153 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7154 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7155 if (MD) { 7156 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7157 IsUnrollMetadata = 7158 S && S->getString().startswith("llvm.loop.unroll.disable"); 7159 } 7160 MDs.push_back(LoopID->getOperand(i)); 7161 } 7162 } 7163 7164 if (!IsUnrollMetadata) { 7165 // Add runtime unroll disable metadata. 7166 LLVMContext &Context = L->getHeader()->getContext(); 7167 SmallVector<Metadata *, 1> DisableOperands; 7168 DisableOperands.push_back( 7169 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7170 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7171 MDs.push_back(DisableNode); 7172 MDNode *NewLoopID = MDNode::get(Context, MDs); 7173 // Set operand 0 to refer to the loop id itself. 7174 NewLoopID->replaceOperandWith(0, NewLoopID); 7175 L->setLoopID(NewLoopID); 7176 } 7177 } 7178 7179 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7180 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7181 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 7182 bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start)); 7183 7184 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 7185 if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) { 7186 Range.End = TmpVF; 7187 break; 7188 } 7189 7190 return PredicateAtRangeStart; 7191 } 7192 7193 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7194 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7195 /// of VF's starting at a given VF and extending it as much as possible. Each 7196 /// vectorization decision can potentially shorten this sub-range during 7197 /// buildVPlan(). 7198 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 7199 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7200 VFRange SubRange = {VF, MaxVF + 1}; 7201 VPlans.push_back(buildVPlan(SubRange)); 7202 VF = SubRange.End; 7203 } 7204 } 7205 7206 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7207 VPlanPtr &Plan) { 7208 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7209 7210 // Look for cached value. 7211 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7212 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7213 if (ECEntryIt != EdgeMaskCache.end()) 7214 return ECEntryIt->second; 7215 7216 VPValue *SrcMask = createBlockInMask(Src, Plan); 7217 7218 // The terminator has to be a branch inst! 7219 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7220 assert(BI && "Unexpected terminator found"); 7221 7222 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7223 return EdgeMaskCache[Edge] = SrcMask; 7224 7225 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 7226 assert(EdgeMask && "No Edge Mask found for condition"); 7227 7228 if (BI->getSuccessor(0) != Dst) 7229 EdgeMask = Builder.createNot(EdgeMask); 7230 7231 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 7232 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 7233 7234 return EdgeMaskCache[Edge] = EdgeMask; 7235 } 7236 7237 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7238 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7239 7240 // Look for cached value. 
7241 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7242 if (BCEntryIt != BlockMaskCache.end()) 7243 return BCEntryIt->second; 7244 7245 // All-one mask is modelled as no-mask following the convention for masked 7246 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7247 VPValue *BlockMask = nullptr; 7248 7249 if (OrigLoop->getHeader() == BB) { 7250 if (!CM.blockNeedsPredication(BB)) 7251 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 7252 7253 // Create the block in mask as the first non-phi instruction in the block. 7254 VPBuilder::InsertPointGuard Guard(Builder); 7255 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 7256 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 7257 7258 // Introduce the early-exit compare IV <= BTC to form header block mask. 7259 // This is used instead of IV < TC because TC may wrap, unlike BTC. 7260 // Start by constructing the desired canonical IV. 7261 VPValue *IV = nullptr; 7262 if (Legal->getPrimaryInduction()) 7263 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 7264 else { 7265 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 7266 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 7267 IV = IVRecipe->getVPValue(); 7268 } 7269 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 7270 bool TailFolded = !CM.isScalarEpilogueAllowed(); 7271 7272 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 7273 // While ActiveLaneMask is a binary op that consumes the loop tripcount 7274 // as a second argument, we only pass the IV here and extract the 7275 // tripcount from the transform state where codegen of the VP instructions 7276 // happen. 7277 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 7278 } else { 7279 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 7280 } 7281 return BlockMaskCache[BB] = BlockMask; 7282 } 7283 7284 // This is the block mask. We OR all incoming edges. 7285 for (auto *Predecessor : predecessors(BB)) { 7286 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 7287 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 7288 return BlockMaskCache[BB] = EdgeMask; 7289 7290 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
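// This is the first non-null edge mask seen; adopt it as the block mask so
// far.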
7291 BlockMask = EdgeMask; 7292 continue; 7293 } 7294 7295 BlockMask = Builder.createOr(BlockMask, EdgeMask); 7296 } 7297 7298 return BlockMaskCache[BB] = BlockMask; 7299 } 7300 7301 VPWidenMemoryInstructionRecipe * 7302 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 7303 VPlanPtr &Plan) { 7304 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7305 "Must be called with either a load or store"); 7306 7307 auto willWiden = [&](ElementCount VF) -> bool { 7308 assert(!VF.isScalable() && "unexpected scalable ElementCount"); 7309 if (VF.isScalar()) 7310 return false; 7311 LoopVectorizationCostModel::InstWidening Decision = 7312 CM.getWideningDecision(I, VF); 7313 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 7314 "CM decision should be taken at this point."); 7315 if (Decision == LoopVectorizationCostModel::CM_Interleave) 7316 return true; 7317 if (CM.isScalarAfterVectorization(I, VF) || 7318 CM.isProfitableToScalarize(I, VF)) 7319 return false; 7320 return Decision != LoopVectorizationCostModel::CM_Scalarize; 7321 }; 7322 7323 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7324 return nullptr; 7325 7326 VPValue *Mask = nullptr; 7327 if (Legal->isMaskRequired(I)) 7328 Mask = createBlockInMask(I->getParent(), Plan); 7329 7330 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 7331 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 7332 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 7333 7334 StoreInst *Store = cast<StoreInst>(I); 7335 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 7336 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 7337 } 7338 7339 VPWidenIntOrFpInductionRecipe * 7340 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 7341 // Check if this is an integer or fp induction. If so, build the recipe that 7342 // produces its scalar and vector values. 7343 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 7344 if (II.getKind() == InductionDescriptor::IK_IntInduction || 7345 II.getKind() == InductionDescriptor::IK_FpInduction) 7346 return new VPWidenIntOrFpInductionRecipe(Phi); 7347 7348 return nullptr; 7349 } 7350 7351 VPWidenIntOrFpInductionRecipe * 7352 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 7353 VFRange &Range) const { 7354 // Optimize the special case where the source is a constant integer 7355 // induction variable. Notice that we can only optimize the 'trunc' case 7356 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 7357 // (c) other casts depend on pointer size. 7358 7359 // Determine whether \p K is a truncation based on an induction variable that 7360 // can be optimized. 7361 auto isOptimizableIVTruncate = 7362 [&](Instruction *K) -> std::function<bool(ElementCount)> { 7363 return [=](ElementCount VF) -> bool { 7364 return CM.isOptimizableIVTruncate(K, VF); 7365 }; 7366 }; 7367 7368 if (LoopVectorizationPlanner::getDecisionAndClampRange( 7369 isOptimizableIVTruncate(I), Range)) 7370 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 7371 I); 7372 return nullptr; 7373 } 7374 7375 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 7376 // We know that all PHIs in non-header blocks are converted into selects, so 7377 // we don't have to worry about the insertion order and we can just use the 7378 // builder. At this point we generate the predication tree. 
There may be 7379 // duplications since this is a simple recursive scan, but future 7380 // optimizations will clean it up. 7381 7382 SmallVector<VPValue *, 2> Operands; 7383 unsigned NumIncoming = Phi->getNumIncomingValues(); 7384 for (unsigned In = 0; In < NumIncoming; In++) { 7385 VPValue *EdgeMask = 7386 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 7387 assert((EdgeMask || NumIncoming == 1) && 7388 "Multiple predecessors with one having a full mask"); 7389 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 7390 if (EdgeMask) 7391 Operands.push_back(EdgeMask); 7392 } 7393 return new VPBlendRecipe(Phi, Operands); 7394 } 7395 7396 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 7397 VPlan &Plan) const { 7398 7399 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7400 [this, CI](ElementCount VF) { 7401 return CM.isScalarWithPredication(CI, VF); 7402 }, 7403 Range); 7404 7405 if (IsPredicated) 7406 return nullptr; 7407 7408 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7409 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 7410 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 7411 return nullptr; 7412 7413 auto willWiden = [&](ElementCount VF) -> bool { 7414 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7415 // The following case may be scalarized depending on the VF. 7416 // The flag shows whether we use Intrinsic or a usual Call for vectorized 7417 // version of the instruction. 7418 // Is it beneficial to perform intrinsic call compared to lib call? 7419 bool NeedToScalarize = false; 7420 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 7421 bool UseVectorIntrinsic = 7422 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 7423 return UseVectorIntrinsic || !NeedToScalarize; 7424 }; 7425 7426 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7427 return nullptr; 7428 7429 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 7430 } 7431 7432 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 7433 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 7434 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 7435 // Instruction should be widened, unless it is scalar after vectorization, 7436 // scalarization is profitable or it is predicated. 
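// The query below is evaluated across the whole VF range; the range is
// clamped at the first VF for which the answer changes, so a single decision
// holds for the resulting sub-range.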
7437 auto WillScalarize = [this, I](ElementCount VF) -> bool { 7438 return CM.isScalarAfterVectorization(I, VF) || 7439 CM.isProfitableToScalarize(I, VF) || 7440 CM.isScalarWithPredication(I, VF); 7441 }; 7442 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 7443 Range); 7444 } 7445 7446 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 7447 auto IsVectorizableOpcode = [](unsigned Opcode) { 7448 switch (Opcode) { 7449 case Instruction::Add: 7450 case Instruction::And: 7451 case Instruction::AShr: 7452 case Instruction::BitCast: 7453 case Instruction::FAdd: 7454 case Instruction::FCmp: 7455 case Instruction::FDiv: 7456 case Instruction::FMul: 7457 case Instruction::FNeg: 7458 case Instruction::FPExt: 7459 case Instruction::FPToSI: 7460 case Instruction::FPToUI: 7461 case Instruction::FPTrunc: 7462 case Instruction::FRem: 7463 case Instruction::FSub: 7464 case Instruction::ICmp: 7465 case Instruction::IntToPtr: 7466 case Instruction::LShr: 7467 case Instruction::Mul: 7468 case Instruction::Or: 7469 case Instruction::PtrToInt: 7470 case Instruction::SDiv: 7471 case Instruction::Select: 7472 case Instruction::SExt: 7473 case Instruction::Shl: 7474 case Instruction::SIToFP: 7475 case Instruction::SRem: 7476 case Instruction::Sub: 7477 case Instruction::Trunc: 7478 case Instruction::UDiv: 7479 case Instruction::UIToFP: 7480 case Instruction::URem: 7481 case Instruction::Xor: 7482 case Instruction::ZExt: 7483 return true; 7484 } 7485 return false; 7486 }; 7487 7488 if (!IsVectorizableOpcode(I->getOpcode())) 7489 return nullptr; 7490 7491 // Success: widen this instruction. 7492 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 7493 } 7494 7495 VPBasicBlock *VPRecipeBuilder::handleReplication( 7496 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7497 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7498 VPlanPtr &Plan) { 7499 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7500 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 7501 Range); 7502 7503 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7504 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 7505 Range); 7506 7507 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 7508 IsUniform, IsPredicated); 7509 setRecipe(I, Recipe); 7510 7511 // Find if I uses a predicated instruction. If so, it will use its scalar 7512 // value. Avoid hoisting the insert-element which packs the scalar value into 7513 // a vector value, as that happens iff all users use the vector value. 7514 for (auto &Op : I->operands()) 7515 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7516 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7517 PredInst2Recipe[PredInst]->setAlsoPack(false); 7518 7519 // Finalize the recipe for Instr, first if it is not predicated. 7520 if (!IsPredicated) { 7521 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7522 VPBB->appendRecipe(Recipe); 7523 return VPBB; 7524 } 7525 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7526 assert(VPBB->getSuccessors().empty() && 7527 "VPBB has successors when handling predicated replication."); 7528 // Record predicated instructions for above packing optimizations. 
7529 PredInst2Recipe[I] = Recipe; 7530 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7531 VPBlockUtils::insertBlockAfter(Region, VPBB); 7532 auto *RegSucc = new VPBasicBlock(); 7533 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7534 return RegSucc; 7535 } 7536 7537 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7538 VPRecipeBase *PredRecipe, 7539 VPlanPtr &Plan) { 7540 // Instructions marked for predication are replicated and placed under an 7541 // if-then construct to prevent side-effects. 7542 7543 // Generate recipes to compute the block mask for this region. 7544 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7545 7546 // Build the triangular if-then region. 7547 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7548 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7549 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7550 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7551 auto *PHIRecipe = 7552 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7553 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7554 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7555 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7556 7557 // Note: first set Entry as region entry and then connect successors starting 7558 // from it in order, to propagate the "parent" of each VPBasicBlock. 7559 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7560 VPBlockUtils::connectBlocks(Pred, Exit); 7561 7562 return Region; 7563 } 7564 7565 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 7566 VFRange &Range, 7567 VPlanPtr &Plan) { 7568 // First, check for specific widening recipes that deal with calls, memory 7569 // operations, inductions and Phi nodes. 7570 if (auto *CI = dyn_cast<CallInst>(Instr)) 7571 return tryToWidenCall(CI, Range, *Plan); 7572 7573 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 7574 return tryToWidenMemory(Instr, Range, Plan); 7575 7576 VPRecipeBase *Recipe; 7577 if (auto Phi = dyn_cast<PHINode>(Instr)) { 7578 if (Phi->getParent() != OrigLoop->getHeader()) 7579 return tryToBlend(Phi, Plan); 7580 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 7581 return Recipe; 7582 return new VPWidenPHIRecipe(Phi); 7583 } 7584 7585 if (isa<TruncInst>(Instr) && 7586 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 7587 return Recipe; 7588 7589 if (!shouldWiden(Instr, Range)) 7590 return nullptr; 7591 7592 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 7593 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 7594 OrigLoop); 7595 7596 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 7597 bool InvariantCond = 7598 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 7599 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 7600 InvariantCond); 7601 } 7602 7603 return tryToWiden(Instr, *Plan); 7604 } 7605 7606 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7607 unsigned MaxVF) { 7608 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7609 7610 // Collect conditions feeding internal conditional branches; they need to be 7611 // represented in VPlan for it to model masking. 
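// For example (illustrative), given
//   if (a[i] > 0) b[i] = x;
// the compare feeding the internal branch needs a VPValue so it can later
// serve as the mask of the predicated store.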
7612 SmallPtrSet<Value *, 1> NeedDef; 7613 7614 auto *Latch = OrigLoop->getLoopLatch(); 7615 for (BasicBlock *BB : OrigLoop->blocks()) { 7616 if (BB == Latch) 7617 continue; 7618 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 7619 if (Branch && Branch->isConditional()) 7620 NeedDef.insert(Branch->getCondition()); 7621 } 7622 7623 // If the tail is to be folded by masking, the primary induction variable, if 7624 // it exists, needs to be represented in VPlan for it to model early-exit masking. 7625 // Also, both the Phi and the live-out instruction of each reduction are 7626 // required in order to introduce a select between them in VPlan. 7627 if (CM.foldTailByMasking()) { 7628 if (Legal->getPrimaryInduction()) 7629 NeedDef.insert(Legal->getPrimaryInduction()); 7630 for (auto &Reduction : Legal->getReductionVars()) { 7631 NeedDef.insert(Reduction.first); 7632 NeedDef.insert(Reduction.second.getLoopExitInstr()); 7633 } 7634 } 7635 7636 // Collect instructions from the original loop that will become trivially dead 7637 // in the vectorized loop. We don't need to vectorize these instructions. For 7638 // example, original induction update instructions can become dead because we 7639 // separately emit induction "steps" when generating code for the new loop. 7640 // Similarly, we create a new latch condition when setting up the structure 7641 // of the new loop, so the old one can become dead. 7642 SmallPtrSet<Instruction *, 4> DeadInstructions; 7643 collectTriviallyDeadInstructions(DeadInstructions); 7644 7645 // Add assume instructions we need to drop to DeadInstructions, to prevent 7646 // them from being added to the VPlan. 7647 // TODO: We only need to drop assumes in blocks that get flattened. If the 7648 // control flow is preserved, we should keep them. 7649 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 7650 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 7651 7652 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 7653 // Dead instructions do not need sinking. Remove them from SinkAfter. 7654 for (Instruction *I : DeadInstructions) 7655 SinkAfter.erase(I); 7656 7657 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7658 VFRange SubRange = {VF, MaxVF + 1}; 7659 VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef, 7660 DeadInstructions, SinkAfter)); 7661 VF = SubRange.End; 7662 } 7663 } 7664 7665 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 7666 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 7667 SmallPtrSetImpl<Instruction *> &DeadInstructions, 7668 const DenseMap<Instruction *, Instruction *> &SinkAfter) { 7669 7670 // Hold a mapping from predicated instructions to their recipes, in order to 7671 // fix their AlsoPack behavior if a user is determined to replicate and use a 7672 // scalar instead of vector value. 7673 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 7674 7675 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 7676 7677 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 7678 7679 // --------------------------------------------------------------------------- 7680 // Pre-construction: record ingredients whose recipes we'll need to further 7681 // process after constructing the initial VPlan. 7682 // --------------------------------------------------------------------------- 7683 7684 // Mark instructions we'll need to sink later and their targets as 7685 // ingredients whose recipe we'll need to record.
7686 for (auto &Entry : SinkAfter) { 7687 RecipeBuilder.recordRecipeOf(Entry.first); 7688 RecipeBuilder.recordRecipeOf(Entry.second); 7689 } 7690 for (auto &Reduction : CM.getInLoopReductionChains()) { 7691 PHINode *Phi = Reduction.first; 7692 RecurrenceDescriptor::RecurrenceKind Kind = 7693 Legal->getReductionVars()[Phi].getRecurrenceKind(); 7694 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 7695 7696 RecipeBuilder.recordRecipeOf(Phi); 7697 for (auto &R : ReductionOperations) { 7698 RecipeBuilder.recordRecipeOf(R); 7699 // For min/max reductions, where we have a pair of icmp/select, we also 7700 // need to record the ICmp recipe, so it can be removed later. 7701 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7702 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7703 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 7704 } 7705 } 7706 } 7707 7708 // For each interleave group which is relevant for this (possibly trimmed) 7709 // Range, add it to the set of groups to be later applied to the VPlan and add 7710 // placeholders for its members' Recipes which we'll be replacing with a 7711 // single VPInterleaveRecipe. 7712 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 7713 auto applyIG = [IG, this](ElementCount VF) -> bool { 7714 return (VF.isVector() && // Query is illegal for VF == 1 7715 CM.getWideningDecision(IG->getInsertPos(), VF) == 7716 LoopVectorizationCostModel::CM_Interleave); 7717 }; 7718 if (!getDecisionAndClampRange(applyIG, Range)) 7719 continue; 7720 InterleaveGroups.insert(IG); 7721 for (unsigned i = 0; i < IG->getFactor(); i++) 7722 if (Instruction *Member = IG->getMember(i)) 7723 RecipeBuilder.recordRecipeOf(Member); 7724 } 7725 7726 // --------------------------------------------------------------------------- 7727 // Build initial VPlan: Scan the body of the loop in a topological order to 7728 // visit each basic block after having visited its predecessor basic blocks. 7729 // --------------------------------------------------------------------------- 7730 7731 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 7732 auto Plan = std::make_unique<VPlan>(); 7733 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 7734 Plan->setEntry(VPBB); 7735 7736 // Represent values that will have defs inside VPlan. 7737 for (Value *V : NeedDef) 7738 Plan->addVPValue(V); 7739 7740 // Scan the body of the loop in a topological order to visit each basic block 7741 // after having visited its predecessor basic blocks. 7742 LoopBlocksDFS DFS(OrigLoop); 7743 DFS.perform(LI); 7744 7745 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 7746 // Relevant instructions from basic block BB will be grouped into VPRecipe 7747 // ingredients and fill a new VPBasicBlock. 7748 unsigned VPBBsForBB = 0; 7749 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 7750 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 7751 VPBB = FirstVPBBForBB; 7752 Builder.setInsertPoint(VPBB); 7753 7754 // Introduce each ingredient into VPlan. 7755 // TODO: Model and preserve debug intrinsics in VPlan. 7756 for (Instruction &I : BB->instructionsWithoutDebug()) { 7757 Instruction *Instr = &I; 7758 7759 // First filter out irrelevant instructions, to ensure no recipes are 7760 // built for them.
7761 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 7762 continue; 7763 7764 if (auto Recipe = 7765 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 7766 RecipeBuilder.setRecipe(Instr, Recipe); 7767 VPBB->appendRecipe(Recipe); 7768 continue; 7769 } 7770 7771 // Otherwise, if all widening options failed, Instruction is to be 7772 // replicated. This may create a successor for VPBB. 7773 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7774 Instr, Range, VPBB, PredInst2Recipe, Plan); 7775 if (NextVPBB != VPBB) { 7776 VPBB = NextVPBB; 7777 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7778 : ""); 7779 } 7780 } 7781 } 7782 7783 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7784 // may also be empty, such as the last one VPBB, reflecting original 7785 // basic-blocks with no recipes. 7786 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7787 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7788 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7789 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7790 delete PreEntry; 7791 7792 // --------------------------------------------------------------------------- 7793 // Transform initial VPlan: Apply previously taken decisions, in order, to 7794 // bring the VPlan to its final state. 7795 // --------------------------------------------------------------------------- 7796 7797 // Apply Sink-After legal constraints. 7798 for (auto &Entry : SinkAfter) { 7799 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7800 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7801 Sink->moveAfter(Target); 7802 } 7803 7804 // Interleave memory: for each Interleave Group we marked earlier as relevant 7805 // for this VPlan, replace the Recipes widening its memory instructions with a 7806 // single VPInterleaveRecipe at its insertion point. 7807 for (auto IG : InterleaveGroups) { 7808 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7809 RecipeBuilder.getRecipe(IG->getInsertPos())); 7810 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7811 ->insertBefore(Recipe); 7812 7813 for (unsigned i = 0; i < IG->getFactor(); ++i) 7814 if (Instruction *Member = IG->getMember(i)) { 7815 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7816 } 7817 } 7818 7819 // Adjust the recipes for any inloop reductions. 7820 if (Range.Start > 1) 7821 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 7822 7823 // Finally, if tail is folded by masking, introduce selects between the phi 7824 // and the live-out instruction of each reduction, at the end of the latch. 
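// The select keeps the previous partial result on masked-off lanes, roughly
// (illustrative, assumed names):
//   %rdx = select <VF x i1> %header.mask, <VF x i32> %rdx.next,
//                 <VF x i32> %rdx.phi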
7825 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 7826 Builder.setInsertPoint(VPBB); 7827 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7828 for (auto &Reduction : Legal->getReductionVars()) { 7829 if (CM.isInLoopReduction(Reduction.first)) 7830 continue; 7831 VPValue *Phi = Plan->getVPValue(Reduction.first); 7832 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7833 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7834 } 7835 } 7836 7837 std::string PlanName; 7838 raw_string_ostream RSO(PlanName); 7839 ElementCount VF = ElementCount::getFixed(Range.Start); 7840 Plan->addVF(VF); 7841 RSO << "Initial VPlan for VF={" << VF; 7842 for (VF *= 2; VF.getKnownMinValue() < Range.End; VF *= 2) { 7843 Plan->addVF(VF); 7844 RSO << "," << VF; 7845 } 7846 RSO << "},UF>=1"; 7847 RSO.flush(); 7848 Plan->setName(PlanName); 7849 7850 return Plan; 7851 } 7852 7853 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7854 // Outer loop handling: They may require CFG and instruction level 7855 // transformations before even evaluating whether vectorization is profitable. 7856 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7857 // the vectorization pipeline. 7858 assert(!OrigLoop->isInnermost()); 7859 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7860 7861 // Create new empty VPlan 7862 auto Plan = std::make_unique<VPlan>(); 7863 7864 // Build hierarchical CFG 7865 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7866 HCFGBuilder.buildHierarchicalCFG(); 7867 7868 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7869 Plan->addVF(ElementCount::getFixed(VF)); 7870 7871 if (EnableVPlanPredication) { 7872 VPlanPredicator VPP(*Plan); 7873 VPP.predicate(); 7874 7875 // Avoid running transformation to recipes until masked code generation in 7876 // VPlan-native path is in place. 7877 return Plan; 7878 } 7879 7880 SmallPtrSet<Instruction *, 1> DeadInstructions; 7881 VPlanTransforms::VPInstructionsToVPRecipes( 7882 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7883 return Plan; 7884 } 7885 7886 // Adjust the recipes for any inloop reductions. The chain of instructions 7887 // leading from the loop exit instr to the phi needs to be converted to 7888 // reductions, with one operand being vector and the other being the scalar 7889 // reduction chain. 7890 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 7891 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { 7892 for (auto &Reduction : CM.getInLoopReductionChains()) { 7893 PHINode *Phi = Reduction.first; 7894 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 7895 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 7896 7897 // ReductionOperations are ordered top-down from the phi's use to the 7898 // LoopExitValue. We keep track of the previous item (the Chain) to tell 7899 // which of the two operands will remain scalar and which will be reduced. 7900 // For minmax the chain will be the select instructions.
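// For example (illustrative), an smax reduction appears as the pair
//   %c        = icmp sgt i32 %max.phi, %x
//   %max.next = select i1 %c, i32 %max.phi, i32 %x
// and only the select is part of the recorded chain.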
7901 Instruction *Chain = Phi; 7902 for (Instruction *R : ReductionOperations) { 7903 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 7904 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind(); 7905 7906 VPValue *ChainOp = Plan->getVPValue(Chain); 7907 unsigned FirstOpId; 7908 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7909 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7910 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 7911 "Expected to replace a VPWidenSelectSC"); 7912 FirstOpId = 1; 7913 } else { 7914 assert(isa<VPWidenRecipe>(WidenRecipe) && 7915 "Expected to replace a VPWidenSC"); 7916 FirstOpId = 0; 7917 } 7918 unsigned VecOpId = 7919 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 7920 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 7921 7922 auto *CondOp = CM.foldTailByMasking() 7923 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 7924 : nullptr; 7925 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 7926 &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); 7927 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 7928 WidenRecipe->eraseFromParent(); 7929 7930 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7931 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7932 VPRecipeBase *CompareRecipe = 7933 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 7934 assert(isa<VPWidenRecipe>(CompareRecipe) && 7935 "Expected to replace a VPWidenSC"); 7936 CompareRecipe->eraseFromParent(); 7937 } 7938 Chain = R; 7939 } 7940 } 7941 } 7942 7943 Value* LoopVectorizationPlanner::VPCallbackILV:: 7944 getOrCreateVectorValues(Value *V, unsigned Part) { 7945 return ILV.getOrCreateVectorValue(V, Part); 7946 } 7947 7948 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7949 Value *V, const VPIteration &Instance) { 7950 return ILV.getOrCreateScalarValue(V, Instance); 7951 } 7952 7953 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7954 VPSlotTracker &SlotTracker) const { 7955 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7956 IG->getInsertPos()->printAsOperand(O, false); 7957 O << ", "; 7958 getAddr()->printAsOperand(O, SlotTracker); 7959 VPValue *Mask = getMask(); 7960 if (Mask) { 7961 O << ", "; 7962 Mask->printAsOperand(O, SlotTracker); 7963 } 7964 for (unsigned i = 0; i < IG->getFactor(); ++i) 7965 if (Instruction *I = IG->getMember(i)) 7966 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 7967 } 7968 7969 void VPWidenCallRecipe::execute(VPTransformState &State) { 7970 State.ILV->widenCallInstruction(Ingredient, *this, State); 7971 } 7972 7973 void VPWidenSelectRecipe::execute(VPTransformState &State) { 7974 State.ILV->widenSelectInstruction(Ingredient, *this, InvariantCond, State); 7975 } 7976 7977 void VPWidenRecipe::execute(VPTransformState &State) { 7978 State.ILV->widenInstruction(Ingredient, *this, State); 7979 } 7980 7981 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7982 State.ILV->widenGEP(GEP, *this, State.UF, State.VF, IsPtrLoopInvariant, 7983 IsIndexLoopInvariant, State); 7984 } 7985 7986 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7987 assert(!State.Instance && "Int or FP induction being replicated."); 7988 State.ILV->widenIntOrFpInduction(IV, Trunc); 7989 } 7990 7991 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7992 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7993 } 7994 7995 void 
VPBlendRecipe::execute(VPTransformState &State) { 7996 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7997 // We know that all PHIs in non-header blocks are converted into 7998 // selects, so we don't have to worry about the insertion order and we 7999 // can just use the builder. 8000 // At this point we generate the predication tree. There may be 8001 // duplications since this is a simple recursive scan, but future 8002 // optimizations will clean it up. 8003 8004 unsigned NumIncoming = getNumIncomingValues(); 8005 8006 // Generate a sequence of selects of the form: 8007 // SELECT(Mask3, In3, 8008 // SELECT(Mask2, In2, 8009 // SELECT(Mask1, In1, 8010 // In0))) 8011 // Note that Mask0 is never used: lanes for which no path reaches this phi and 8012 // are essentially undef are taken from In0. 8013 InnerLoopVectorizer::VectorParts Entry(State.UF); 8014 for (unsigned In = 0; In < NumIncoming; ++In) { 8015 for (unsigned Part = 0; Part < State.UF; ++Part) { 8016 // We might have single edge PHIs (blocks) - use an identity 8017 // 'select' for the first PHI operand. 8018 Value *In0 = State.get(getIncomingValue(In), Part); 8019 if (In == 0) 8020 Entry[Part] = In0; // Initialize with the first incoming value. 8021 else { 8022 // Select between the current value and the previous incoming edge 8023 // based on the incoming mask. 8024 Value *Cond = State.get(getMask(In), Part); 8025 Entry[Part] = 8026 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 8027 } 8028 } 8029 } 8030 for (unsigned Part = 0; Part < State.UF; ++Part) 8031 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 8032 } 8033 8034 void VPInterleaveRecipe::execute(VPTransformState &State) { 8035 assert(!State.Instance && "Interleave group being replicated."); 8036 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); 8037 } 8038 8039 void VPReductionRecipe::execute(VPTransformState &State) { 8040 assert(!State.Instance && "Reduction being replicated."); 8041 for (unsigned Part = 0; Part < State.UF; ++Part) { 8042 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind(); 8043 Value *NewVecOp = State.get(VecOp, Part); 8044 if (CondOp) { 8045 Value *NewCond = State.get(CondOp, Part); 8046 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 8047 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 8048 Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType()); 8049 Constant *IdenVec = 8050 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 8051 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 8052 NewVecOp = Select; 8053 } 8054 Value *NewRed = 8055 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN); 8056 Value *PrevInChain = State.get(ChainOp, Part); 8057 Value *NextInChain; 8058 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8059 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8060 NextInChain = 8061 createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(), 8062 NewRed, PrevInChain); 8063 } else { 8064 NextInChain = State.Builder.CreateBinOp( 8065 (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain); 8066 } 8067 State.ValueMap.setVectorValue(I, Part, NextInChain); 8068 } 8069 } 8070 8071 void VPReplicateRecipe::execute(VPTransformState &State) { 8072 if (State.Instance) { // Generate a single instance. 8073 State.ILV->scalarizeInstruction(Ingredient, *this, *State.Instance, 8074 IsPredicated, State); 8075 // Insert scalar instance packing it into a vector. 
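// Whether to pack was decided during recipe construction (the AlsoPack
// flag); see VPRecipeBuilder::handleReplication.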
8076 if (AlsoPack && State.VF.isVector()) { 8077 // If we're constructing lane 0, initialize to start from undef. 8078 if (State.Instance->Lane == 0) { 8079 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 8080 Value *Undef = 8081 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); 8082 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); 8083 } 8084 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); 8085 } 8086 return; 8087 } 8088 8089 // Generate scalar instances for all VF lanes of all UF parts, unless the 8090 // instruction is uniform, in which case generate only the first lane for each 8091 // of the UF parts. 8092 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); 8093 for (unsigned Part = 0; Part < State.UF; ++Part) 8094 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 8095 State.ILV->scalarizeInstruction(Ingredient, *this, {Part, Lane}, 8096 IsPredicated, State); 8097 } 8098 8099 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 8100 assert(State.Instance && "Branch on Mask works only on single instance."); 8101 8102 unsigned Part = State.Instance->Part; 8103 unsigned Lane = State.Instance->Lane; 8104 8105 Value *ConditionBit = nullptr; 8106 VPValue *BlockInMask = getMask(); 8107 if (BlockInMask) { 8108 ConditionBit = State.get(BlockInMask, Part); 8109 if (ConditionBit->getType()->isVectorTy()) 8110 ConditionBit = State.Builder.CreateExtractElement( 8111 ConditionBit, State.Builder.getInt32(Lane)); 8112 } else // Block in mask is all-one. 8113 ConditionBit = State.Builder.getTrue(); 8114 8115 // Replace the temporary unreachable terminator with a new conditional branch, 8116 // whose two destinations will be set later when they are created. 8117 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 8118 assert(isa<UnreachableInst>(CurrentTerminator) && 8119 "Expected to replace unreachable terminator with conditional branch."); 8120 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 8121 CondBr->setSuccessor(0, nullptr); 8122 ReplaceInstWithInst(CurrentTerminator, CondBr); 8123 } 8124 8125 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 8126 assert(State.Instance && "Predicated instruction PHI works per instance."); 8127 Instruction *ScalarPredInst = cast<Instruction>( 8128 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 8129 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 8130 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 8131 assert(PredicatingBB && "Predicated block has no single predecessor."); 8132 8133 // By current pack/unpack logic we need to generate only a single phi node: if 8134 // a vector value for the predicated instruction exists at this point it means 8135 // the instruction has vector users only, and a phi for the vector value is 8136 // needed. In this case the recipe of the predicated instruction is marked to 8137 // also do that packing, thereby "hoisting" the insert-element sequence. 8138 // Otherwise, a phi node for the scalar value is needed. 8139 unsigned Part = State.Instance->Part; 8140 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 8141 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 8142 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 8143 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 8144 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8145 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 8146 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 8147 } else { 8148 Type *PredInstType = PredInst->getType(); 8149 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 8150 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 8151 Phi->addIncoming(ScalarPredInst, PredicatedBB); 8152 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 8153 } 8154 } 8155 8156 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 8157 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 8158 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, 8159 getMask()); 8160 } 8161 8162 // Determine how to lower the scalar epilogue, which depends on 1) optimising 8163 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 8164 // predication, and 4) a TTI hook that analyses whether the loop is suitable 8165 // for predication. 8166 static ScalarEpilogueLowering getScalarEpilogueLowering( 8167 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 8168 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 8169 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 8170 LoopVectorizationLegality &LVL) { 8171 // 1) OptSize takes precedence over all other options, i.e. if this is set, 8172 // don't look at hints or options, and don't request a scalar epilogue. 8173 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 8174 // LoopAccessInfo (due to code dependency and not being able to reliably get 8175 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 8176 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 8177 // versioning when the vectorization is forced, unlike hasOptSize. So revert 8178 // back to the old way and vectorize with versioning when forced. See D81345.) 8179 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 8180 PGSOQueryType::IRPass) && 8181 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 8182 return CM_ScalarEpilogueNotAllowedOptSize; 8183 8184 bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() && 8185 !PreferPredicateOverEpilogue; 8186 8187 // 2) Next, if disabling predication is requested on the command line, honour 8188 // this and request a scalar epilogue. 8189 if (PredicateOptDisabled) 8190 return CM_ScalarEpilogueAllowed; 8191 8192 // 3) and 4) look if enabling predication is requested on the command line, 8193 // with a loop hint, or if the TTI hook indicates this is profitable, request 8194 // predication. 8195 if (PreferPredicateOverEpilogue || 8196 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || 8197 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 8198 LVL.getLAI()) && 8199 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) 8200 return CM_ScalarEpilogueNotNeededUsePredicate; 8201 8202 return CM_ScalarEpilogueAllowed; 8203 } 8204 8205 // Process the loop in the VPlan-native vectorization path. This path builds 8206 // VPlan upfront in the vectorization pipeline, which allows to apply 8207 // VPlan-to-VPlan transformations from the very beginning without modifying the 8208 // input LLVM IR. 
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);

  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF =
      LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM, BFI, PSI);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " unroll=" << Hints.getInterleave() << "\n");

  // Function containing the loop.
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose; they report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot
  // modify the incoming IR, we need to build VPlan upfront in the
  // vectorization pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny
  // trip count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem right -- what if the loop is an integer
  // loop and the vector instructions selected are purely integer vector
  // instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor and interleave count.
  unsigned UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF =
      LVP.plan(ElementCount::getFixed(UserVF), UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being
    // explicitly requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
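    // (Illustrative example, not code from this pass: the UserIC == 1 case
    //  handled just below is commonly reached when the source loop carries
    //  the Clang loop pragma
    //    #pragma clang loop interleave_count(1)
    //  which LoopVectorizeHints surfaces here as Hints.getInterleave() == 1.)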
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not profitable to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM, BFI, PSI);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling of the scalar loop when there
    // are no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
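    // (For reference, and assuming the standard loop-metadata spelling:
    //  AddRuntimeUnrollDisableMetaData attaches
    //    !{!"llvm.loop.unroll.runtime.disable"}
    //  to the remainder loop's !llvm.loop metadata; it is shown here only to
    //  illustrate what "disable runtime unrolling" amounts to further down.)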
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // All loops in the function have been processed; report what changed.
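  // (Changed covers any IR modification, including the loop-simplify and
  //  LCSSA canonicalization performed above; CFGChanged records whether the
  //  CFG was altered. run() below uses these flags to decide which analyses
  //  can be preserved.)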
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve the LoopInfo/DominatorTree analyses with
  // outer loop vectorization. Until this is addressed, mark these analyses as
  // preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for the VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
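// A minimal way to exercise this pass in isolation and to surface the remarks
// emitted in processLoop() above (OptimizationRemark, OptimizationRemarkMissed
// and OptimizationRemarkAnalysis) is, for example:
//
//   opt -passes=loop-vectorize -S input.ll
//   clang -O2 -Rpass=loop-vectorize -Rpass-missed=loop-vectorize \
//         -Rpass-analysis=loop-vectorize file.c
//
// These invocations are illustrations only; the pass/remark name
// "loop-vectorize" is assumed to match the LV_NAME spelling used by the
// remarks above.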