//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
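//
// As a rough, illustrative sketch of that transformation (not actual compiler
// output), a scalar loop such as
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
// conceptually becomes a loop that steps by the vector width VF,
//   for (i = 0; i + VF <= n; i += VF)
//     A[i:i+VF] = B[i:i+VF] + 42;   // one wide load/add/store per step
// with the remaining iterations handled by a scalar epilogue loop or folded
// into the vector body via predication.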
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; it lists the available
// strategies. That is, the vectorizer will try to fold the tail loop
// (epilogue) into the vector body and predicate the instructions accordingly.
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
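// For example (assuming the standard cl::opt plumbing), this option can be
// enabled with "-mllvm -enable-masked-interleaved-mem-accesses" when driving
// LLVM through clang, or by passing the flag directly to opt/llc.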
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
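// For example, with a typical x86-64 data layout, x86_fp80 is irregular: its
// store size is 10 bytes but its alloc size is 16 bytes, so an array of VF
// x86_fp80 elements is not bitcast-compatible with a <VF x x86_fp80> vector.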
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects.
/// The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
                ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between
  /// \p MinLane and \p MaxLane, times each part between \p MinPart and
  /// \p MaxPart, inclusive.
  /// Uses the VPValue operands from \p Operands instead of \p Instr's
  /// operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate
  /// a vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The loop exit block may have single-value PHI nodes with some incoming
  /// value. While vectorizing we only handle real values that were defined
  /// inside the loop, and we should have one value for each predecessor of its
  /// parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one.
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  void createInductionResumeValues(Loop *L, Value *VectorTripCount);

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE;
       ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If the instruction has a debug location, use it; otherwise keep the
    // loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag,
                             TheLoop, I)
            << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Hints for the loop-vectorization cost model about how the scalar epilogue
// loop should be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form chosen after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
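  // For example, the pointer operand of a consecutive (widened) load or store
  // is typically uniform: a single scalar address per unrolled part suffices
  // to form the wide memory operation.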
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W, unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(!VF.isScalable() && "scalable vectors not yet supported.");
    assert(VF.isVector() && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail loop.
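  // For example, with a trip count of 10 and VF = 4, folding the tail means
  // executing 3 masked vector iterations covering lanes 0..11 (with lanes 10
  // and 11 disabled by the mask) instead of 2 vector iterations plus a
  // 2-iteration scalar epilogue.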
1385 bool foldTailByMasking() const { return FoldTailByMasking; } 1386 1387 bool blockNeedsPredication(BasicBlock *BB) { 1388 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1389 } 1390 1391 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1392 /// nodes to the chain of instructions representing the reductions. Uses a 1393 /// MapVector to ensure deterministic iteration order. 1394 using ReductionChainMap = 1395 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1396 1397 /// Return the chain of instructions representing an inloop reduction. 1398 const ReductionChainMap &getInLoopReductionChains() const { 1399 return InLoopReductionChains; 1400 } 1401 1402 /// Returns true if the Phi is part of an inloop reduction. 1403 bool isInLoopReduction(PHINode *Phi) const { 1404 return InLoopReductionChains.count(Phi); 1405 } 1406 1407 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1408 /// with factor VF. Return the cost of the instruction, including 1409 /// scalarization overhead if it's needed. 1410 unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1411 1412 /// Estimate cost of a call instruction CI if it were vectorized with factor 1413 /// VF. Return the cost of the instruction, including scalarization overhead 1414 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1415 /// scalarized - 1416 /// i.e. either vector version isn't available, or is too expensive. 1417 unsigned getVectorCallCost(CallInst *CI, ElementCount VF, 1418 bool &NeedToScalarize); 1419 1420 /// Invalidates decisions already taken by the cost model. 1421 void invalidateCostModelingDecisions() { 1422 WideningDecisions.clear(); 1423 Uniforms.clear(); 1424 Scalars.clear(); 1425 } 1426 1427 private: 1428 unsigned NumPredStores = 0; 1429 1430 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1431 /// than zero. One is returned if vectorization should best be avoided due 1432 /// to cost. 1433 unsigned computeFeasibleMaxVF(unsigned ConstTripCount); 1434 1435 /// The vectorization cost is a combination of the cost itself and a boolean 1436 /// indicating whether any of the contributing operations will actually 1437 /// operate on 1438 /// vector values after type legalization in the backend. If this latter value 1439 /// is 1440 /// false, then all operations will be scalarized (i.e. no vectorization has 1441 /// actually taken place). 1442 using VectorizationCostTy = std::pair<unsigned, bool>; 1443 1444 /// Returns the expected execution cost. The unit of the cost does 1445 /// not matter because we use the 'cost' units to compare different 1446 /// vector widths. The cost that is returned is *not* normalized by 1447 /// the factor width. 1448 VectorizationCostTy expectedCost(ElementCount VF); 1449 1450 /// Returns the execution time cost of an instruction for a given vector 1451 /// width. Vector width of one means scalar. 1452 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1453 1454 /// The cost-computation logic from getInstructionCost which provides 1455 /// the vector type as an output parameter. 1456 unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy); 1457 1458 /// Calculate vectorization cost of memory instruction \p I. 1459 unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF); 1460 1461 /// The cost computation for scalarized memory instruction. 
1462 unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1463
1464 /// The cost computation for an interleaving group of memory instructions.
1465 unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1466
1467 /// The cost computation for a Gather/Scatter instruction.
1468 unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1469
1470 /// The cost computation for widening instruction \p I with consecutive
1471 /// memory access.
1472 unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1473
1474 /// The cost calculation for Load/Store instruction \p I with a uniform pointer -
1475 /// Load: scalar load + broadcast.
1476 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1477 /// element).
1478 unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1479
1480 /// Estimate the overhead of scalarizing an instruction. This is a
1481 /// convenience wrapper for the type-based getScalarizationOverhead API.
1482 unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1483
1484 /// Returns whether the instruction is a load or store and will be emitted
1485 /// as a vector operation.
1486 bool isConsecutiveLoadOrStore(Instruction *I);
1487
1488 /// Returns true if an artificially high cost for emulated masked memrefs
1489 /// should be used.
1490 bool useEmulatedMaskMemRefHack(Instruction *I);
1491
1492 /// Map of scalar integer values to the smallest bitwidth they can be legally
1493 /// represented as. The vector equivalents of these values should be truncated
1494 /// to this type.
1495 MapVector<Instruction *, uint64_t> MinBWs;
1496
1497 /// A type representing the costs for instructions if they were to be
1498 /// scalarized rather than vectorized. The entries are Instruction-Cost
1499 /// pairs.
1500 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1501
1502 /// A set containing all BasicBlocks that are known to be present after
1503 /// vectorization as a predicated block.
1504 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1505
1506 /// Records whether it is allowed to have the original scalar loop execute at
1507 /// least once. This may be needed as a fallback loop in case runtime
1508 /// aliasing/dependence checks fail, or to handle the tail/remainder
1509 /// iterations when the trip count is unknown or is not a multiple of the VF,
1510 /// or as a peel-loop to handle gaps in interleave-groups.
1511 /// Under optsize and when the trip count is very small we don't allow any
1512 /// iterations to execute in the scalar loop.
1513 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1514
1515 /// All blocks of the loop are to be masked in order to fold the tail of the scalar iterations.
1516 bool FoldTailByMasking = false;
1517
1518 /// A map holding scalar costs for different vectorization factors. The
1519 /// presence of a cost for an instruction in the mapping indicates that the
1520 /// instruction will be scalarized when vectorizing with the associated
1521 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1522 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1523
1524 /// Holds the instructions known to be uniform after vectorization.
1525 /// The data is collected per VF.
1526 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1527
1528 /// Holds the instructions known to be scalar after vectorization.
1529 /// The data is collected per VF.
1530 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1531 1532 /// Holds the instructions (address computations) that are forced to be 1533 /// scalarized. 1534 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1535 1536 /// PHINodes of the reductions that should be expanded in-loop along with 1537 /// their associated chains of reduction operations, in program order from top 1538 /// (PHI) to bottom 1539 ReductionChainMap InLoopReductionChains; 1540 1541 /// Returns the expected difference in cost from scalarizing the expression 1542 /// feeding a predicated instruction \p PredInst. The instructions to 1543 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1544 /// non-negative return value implies the expression will be scalarized. 1545 /// Currently, only single-use chains are considered for scalarization. 1546 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1547 ElementCount VF); 1548 1549 /// Collect the instructions that are uniform after vectorization. An 1550 /// instruction is uniform if we represent it with a single scalar value in 1551 /// the vectorized loop corresponding to each vector iteration. Examples of 1552 /// uniform instructions include pointer operands of consecutive or 1553 /// interleaved memory accesses. Note that although uniformity implies an 1554 /// instruction will be scalar, the reverse is not true. In general, a 1555 /// scalarized instruction will be represented by VF scalar values in the 1556 /// vectorized loop, each corresponding to an iteration of the original 1557 /// scalar loop. 1558 void collectLoopUniforms(ElementCount VF); 1559 1560 /// Collect the instructions that are scalar after vectorization. An 1561 /// instruction is scalar if it is known to be uniform or will be scalarized 1562 /// during vectorization. Non-uniform scalarized instructions will be 1563 /// represented by VF values in the vectorized loop, each corresponding to an 1564 /// iteration of the original scalar loop. 1565 void collectLoopScalars(ElementCount VF); 1566 1567 /// Keeps cost model vectorization decision and cost for instructions. 1568 /// Right now it is used for memory instructions only. 1569 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1570 std::pair<InstWidening, unsigned>>; 1571 1572 DecisionList WideningDecisions; 1573 1574 /// Returns true if \p V is expected to be vectorized and it needs to be 1575 /// extracted. 1576 bool needsExtract(Value *V, ElementCount VF) const { 1577 Instruction *I = dyn_cast<Instruction>(V); 1578 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1579 TheLoop->isLoopInvariant(I)) 1580 return false; 1581 1582 // Assume we can vectorize V (and hence we need extraction) if the 1583 // scalars are not computed yet. This can happen, because it is called 1584 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1585 // the scalars are collected. That should be a safe assumption in most 1586 // cases, because we check if the operands have vectorizable types 1587 // beforehand in LoopVectorizationLegality. 1588 return Scalars.find(VF) == Scalars.end() || 1589 !isScalarAfterVectorization(I, VF); 1590 }; 1591 1592 /// Returns a range containing only operands needing to be extracted. 
1593 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1594 ElementCount VF) { 1595 return SmallVector<Value *, 4>(make_filter_range( 1596 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1597 } 1598 1599 public: 1600 /// The loop that we evaluate. 1601 Loop *TheLoop; 1602 1603 /// Predicated scalar evolution analysis. 1604 PredicatedScalarEvolution &PSE; 1605 1606 /// Loop Info analysis. 1607 LoopInfo *LI; 1608 1609 /// Vectorization legality. 1610 LoopVectorizationLegality *Legal; 1611 1612 /// Vector target information. 1613 const TargetTransformInfo &TTI; 1614 1615 /// Target Library Info. 1616 const TargetLibraryInfo *TLI; 1617 1618 /// Demanded bits analysis. 1619 DemandedBits *DB; 1620 1621 /// Assumption cache. 1622 AssumptionCache *AC; 1623 1624 /// Interface to emit optimization remarks. 1625 OptimizationRemarkEmitter *ORE; 1626 1627 const Function *TheFunction; 1628 1629 /// Loop Vectorize Hint. 1630 const LoopVectorizeHints *Hints; 1631 1632 /// The interleave access information contains groups of interleaved accesses 1633 /// with the same stride and close to each other. 1634 InterleavedAccessInfo &InterleaveInfo; 1635 1636 /// Values to ignore in the cost model. 1637 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1638 1639 /// Values to ignore in the cost model when VF > 1. 1640 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1641 }; 1642 1643 } // end namespace llvm 1644 1645 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1646 // vectorization. The loop needs to be annotated with #pragma omp simd 1647 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1648 // vector length information is not provided, vectorization is not considered 1649 // explicit. Interleave hints are not allowed either. These limitations will be 1650 // relaxed in the future. 1651 // Please, note that we are currently forced to abuse the pragma 'clang 1652 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1653 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1654 // provides *explicit vectorization hints* (LV can bypass legal checks and 1655 // assume that vectorization is legal). However, both hints are implemented 1656 // using the same metadata (llvm.loop.vectorize, processed by 1657 // LoopVectorizeHints). This will be fixed in the future when the native IR 1658 // representation for pragma 'omp simd' is introduced. 1659 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1660 OptimizationRemarkEmitter *ORE) { 1661 assert(!OuterLp->empty() && "This is not an outer loop"); 1662 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1663 1664 // Only outer loops with an explicit vectorization hint are supported. 1665 // Unannotated outer loops are ignored. 1666 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1667 return false; 1668 1669 Function *Fn = OuterLp->getHeader()->getParent(); 1670 if (!Hints.allowVectorization(Fn, OuterLp, 1671 true /*VectorizeOnlyWhenForced*/)) { 1672 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1673 return false; 1674 } 1675 1676 if (Hints.getInterleave() > 1) { 1677 // TODO: Interleave support is future work. 
1678 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1679 "outer loops.\n"); 1680 Hints.emitRemarkWithHints(); 1681 return false; 1682 } 1683 1684 return true; 1685 } 1686 1687 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1688 OptimizationRemarkEmitter *ORE, 1689 SmallVectorImpl<Loop *> &V) { 1690 // Collect inner loops and outer loops without irreducible control flow. For 1691 // now, only collect outer loops that have explicit vectorization hints. If we 1692 // are stress testing the VPlan H-CFG construction, we collect the outermost 1693 // loop of every loop nest. 1694 if (L.empty() || VPlanBuildStressTest || 1695 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1696 LoopBlocksRPO RPOT(&L); 1697 RPOT.perform(LI); 1698 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1699 V.push_back(&L); 1700 // TODO: Collect inner loops inside marked outer loops in case 1701 // vectorization fails for the outer loop. Do not invoke 1702 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1703 // already known to be reducible. We can use an inherited attribute for 1704 // that. 1705 return; 1706 } 1707 } 1708 for (Loop *InnerL : L) 1709 collectSupportedLoops(*InnerL, LI, ORE, V); 1710 } 1711 1712 namespace { 1713 1714 /// The LoopVectorize Pass. 1715 struct LoopVectorize : public FunctionPass { 1716 /// Pass identification, replacement for typeid 1717 static char ID; 1718 1719 LoopVectorizePass Impl; 1720 1721 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1722 bool VectorizeOnlyWhenForced = false) 1723 : FunctionPass(ID), 1724 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1725 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1726 } 1727 1728 bool runOnFunction(Function &F) override { 1729 if (skipFunction(F)) 1730 return false; 1731 1732 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1733 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1734 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1735 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1736 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1737 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1738 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 1739 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1740 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1741 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1742 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1743 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1744 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1745 1746 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1747 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1748 1749 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1750 GetLAA, *ORE, PSI).MadeAnyChange; 1751 } 1752 1753 void getAnalysisUsage(AnalysisUsage &AU) const override { 1754 AU.addRequired<AssumptionCacheTracker>(); 1755 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1756 AU.addRequired<DominatorTreeWrapperPass>(); 1757 AU.addRequired<LoopInfoWrapperPass>(); 1758 AU.addRequired<ScalarEvolutionWrapperPass>(); 1759 AU.addRequired<TargetTransformInfoWrapperPass>(); 1760 AU.addRequired<AAResultsWrapperPass>(); 1761 AU.addRequired<LoopAccessLegacyAnalysis>(); 1762 AU.addRequired<DemandedBitsWrapperPass>(); 1763 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1764 AU.addRequired<InjectTLIMappingsLegacy>(); 1765 1766 // We currently do not preserve loopinfo/dominator analyses with outer loop 1767 // vectorization. Until this is addressed, mark these analyses as preserved 1768 // only for non-VPlan-native path. 1769 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1770 if (!EnableVPlanNativePath) { 1771 AU.addPreserved<LoopInfoWrapperPass>(); 1772 AU.addPreserved<DominatorTreeWrapperPass>(); 1773 } 1774 1775 AU.addPreserved<BasicAAWrapperPass>(); 1776 AU.addPreserved<GlobalsAAWrapperPass>(); 1777 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1778 } 1779 }; 1780 1781 } // end anonymous namespace 1782 1783 //===----------------------------------------------------------------------===// 1784 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1785 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1786 //===----------------------------------------------------------------------===// 1787 1788 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1789 // We need to place the broadcast of invariant variables outside the loop, 1790 // but only if it's proven safe to do so. Else, broadcast will be inside 1791 // vector loop body. 1792 Instruction *Instr = dyn_cast<Instruction>(V); 1793 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1794 (!Instr || 1795 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1796 // Place the code for broadcasting invariant variables in the new preheader. 1797 IRBuilder<>::InsertPointGuard Guard(Builder); 1798 if (SafeToHoist) 1799 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1800 1801 // Broadcast the scalar into all locations in the vector. 
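// For illustration only (the value names below are hypothetical): for VF = 4
// and an invariant i32 %x, the splat created below expands to roughly
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer
// i.e. one scalar value replicated into every vector lane.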
1802 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1803 1804 return Shuf; 1805 } 1806 1807 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1808 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1809 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1810 "Expected either an induction phi-node or a truncate of it!"); 1811 Value *Start = II.getStartValue(); 1812 1813 // Construct the initial value of the vector IV in the vector loop preheader 1814 auto CurrIP = Builder.saveIP(); 1815 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1816 if (isa<TruncInst>(EntryVal)) { 1817 assert(Start->getType()->isIntegerTy() && 1818 "Truncation requires an integer type"); 1819 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1820 Step = Builder.CreateTrunc(Step, TruncType); 1821 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1822 } 1823 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1824 Value *SteppedStart = 1825 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1826 1827 // We create vector phi nodes for both integer and floating-point induction 1828 // variables. Here, we determine the kind of arithmetic we will perform. 1829 Instruction::BinaryOps AddOp; 1830 Instruction::BinaryOps MulOp; 1831 if (Step->getType()->isIntegerTy()) { 1832 AddOp = Instruction::Add; 1833 MulOp = Instruction::Mul; 1834 } else { 1835 AddOp = II.getInductionOpcode(); 1836 MulOp = Instruction::FMul; 1837 } 1838 1839 // Multiply the vectorization factor by the step using integer or 1840 // floating-point arithmetic as appropriate. 1841 Value *ConstVF = 1842 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 1843 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1844 1845 // Create a vector splat to use in the induction update. 1846 // 1847 // FIXME: If the step is non-constant, we create the vector splat with 1848 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1849 // handle a constant vector splat. 1850 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1851 Value *SplatVF = isa<Constant>(Mul) 1852 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 1853 : Builder.CreateVectorSplat(VF, Mul); 1854 Builder.restoreIP(CurrIP); 1855 1856 // We may need to add the step a number of times, depending on the unroll 1857 // factor. The last of those goes into the PHI. 1858 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1859 &*LoopVectorBody->getFirstInsertionPt()); 1860 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1861 Instruction *LastInduction = VecInd; 1862 for (unsigned Part = 0; Part < UF; ++Part) { 1863 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1864 1865 if (isa<TruncInst>(EntryVal)) 1866 addMetadata(LastInduction, EntryVal); 1867 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1868 1869 LastInduction = cast<Instruction>(addFastMathFlag( 1870 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1871 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1872 } 1873 1874 // Move the last step to the end of the latch block. This ensures consistent 1875 // placement of all induction updates. 
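// For illustration only (block and value names are hypothetical): with VF = 4,
// UF = 2 and an i32 IV starting at 0 with step 1, the generated code looks
// roughly like
//   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                                 [ %vec.ind.next, %vector.body ]        ; part 0
//   %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4> ; part 1
//   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>
// with %vec.ind.next moved into the latch block by the code below.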
1876 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1877 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1878 auto *ICmp = cast<Instruction>(Br->getCondition()); 1879 LastInduction->moveBefore(ICmp); 1880 LastInduction->setName("vec.ind.next"); 1881 1882 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1883 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1884 } 1885 1886 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1887 return Cost->isScalarAfterVectorization(I, VF) || 1888 Cost->isProfitableToScalarize(I, VF); 1889 } 1890 1891 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1892 if (shouldScalarizeInstruction(IV)) 1893 return true; 1894 auto isScalarInst = [&](User *U) -> bool { 1895 auto *I = cast<Instruction>(U); 1896 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1897 }; 1898 return llvm::any_of(IV->users(), isScalarInst); 1899 } 1900 1901 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1902 const InductionDescriptor &ID, const Instruction *EntryVal, 1903 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1904 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1905 "Expected either an induction phi-node or a truncate of it!"); 1906 1907 // This induction variable is not the phi from the original loop but the 1908 // newly-created IV based on the proof that casted Phi is equal to the 1909 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1910 // re-uses the same InductionDescriptor that original IV uses but we don't 1911 // have to do any recording in this case - that is done when original IV is 1912 // processed. 1913 if (isa<TruncInst>(EntryVal)) 1914 return; 1915 1916 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1917 if (Casts.empty()) 1918 return; 1919 // Only the first Cast instruction in the Casts vector is of interest. 1920 // The rest of the Casts (if exist) have no uses outside the 1921 // induction update chain itself. 1922 Instruction *CastInst = *Casts.begin(); 1923 if (Lane < UINT_MAX) 1924 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1925 else 1926 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1927 } 1928 1929 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1930 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1931 "Primary induction variable must have an integer type"); 1932 1933 auto II = Legal->getInductionVars().find(IV); 1934 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1935 1936 auto ID = II->second; 1937 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1938 1939 // The value from the original loop to which we are mapping the new induction 1940 // variable. 1941 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1942 1943 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1944 1945 // Generate code for the induction step. 
Note that induction steps are 1946 // required to be loop-invariant 1947 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1948 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1949 "Induction step should be loop invariant"); 1950 if (PSE.getSE()->isSCEVable(IV->getType())) { 1951 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1952 return Exp.expandCodeFor(Step, Step->getType(), 1953 LoopVectorPreHeader->getTerminator()); 1954 } 1955 return cast<SCEVUnknown>(Step)->getValue(); 1956 }; 1957 1958 // The scalar value to broadcast. This is derived from the canonical 1959 // induction variable. If a truncation type is given, truncate the canonical 1960 // induction variable and step. Otherwise, derive these values from the 1961 // induction descriptor. 1962 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1963 Value *ScalarIV = Induction; 1964 if (IV != OldInduction) { 1965 ScalarIV = IV->getType()->isIntegerTy() 1966 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1967 : Builder.CreateCast(Instruction::SIToFP, Induction, 1968 IV->getType()); 1969 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1970 ScalarIV->setName("offset.idx"); 1971 } 1972 if (Trunc) { 1973 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1974 assert(Step->getType()->isIntegerTy() && 1975 "Truncation requires an integer step"); 1976 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1977 Step = Builder.CreateTrunc(Step, TruncType); 1978 } 1979 return ScalarIV; 1980 }; 1981 1982 // Create the vector values from the scalar IV, in the absence of creating a 1983 // vector IV. 1984 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1985 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1986 for (unsigned Part = 0; Part < UF; ++Part) { 1987 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1988 Value *EntryPart = 1989 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 1990 ID.getInductionOpcode()); 1991 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1992 if (Trunc) 1993 addMetadata(EntryPart, Trunc); 1994 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1995 } 1996 }; 1997 1998 // Now do the actual transformations, and start with creating the step value. 1999 Value *Step = CreateStepValue(ID.getStep()); 2000 if (VF.isZero() || VF.isScalar()) { 2001 Value *ScalarIV = CreateScalarIV(Step); 2002 CreateSplatIV(ScalarIV, Step); 2003 return; 2004 } 2005 2006 // Determine if we want a scalar version of the induction variable. This is 2007 // true if the induction variable itself is not widened, or if it has at 2008 // least one user in the loop that is not widened. 2009 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2010 if (!NeedsScalarIV) { 2011 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2012 return; 2013 } 2014 2015 // Try to create a new independent vector induction variable. If we can't 2016 // create the phi node, we will splat the scalar induction variable in each 2017 // loop iteration. 2018 if (!shouldScalarizeInstruction(EntryVal)) { 2019 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2020 Value *ScalarIV = CreateScalarIV(Step); 2021 // Create scalar steps that can be used by instructions we will later 2022 // scalarize. Note that the addition of the scalar steps will not increase 2023 // the number of instructions in the loop in the common case prior to 2024 // InstCombine. We will be trading one vector extract for each scalar step. 
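// For illustration only: with VF = 4 and UF = 1, buildScalarSteps below
// materializes lane L of part 0 as ScalarIV + L * Step (in general,
// ScalarIV + (Part * VF + Lane) * Step), so each scalar user reads that value
// directly instead of extracting lane L from the vector IV.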
2025 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2026 return; 2027 } 2028 2029 // All IV users are scalar instructions, so only emit a scalar IV, not a 2030 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2031 // predicate used by the masked loads/stores. 2032 Value *ScalarIV = CreateScalarIV(Step); 2033 if (!Cost->isScalarEpilogueAllowed()) 2034 CreateSplatIV(ScalarIV, Step); 2035 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2036 } 2037 2038 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2039 Instruction::BinaryOps BinOp) { 2040 // Create and check the types. 2041 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2042 int VLen = ValVTy->getNumElements(); 2043 2044 Type *STy = Val->getType()->getScalarType(); 2045 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2046 "Induction Step must be an integer or FP"); 2047 assert(Step->getType() == STy && "Step has wrong type"); 2048 2049 SmallVector<Constant *, 8> Indices; 2050 2051 if (STy->isIntegerTy()) { 2052 // Create a vector of consecutive numbers from zero to VF. 2053 for (int i = 0; i < VLen; ++i) 2054 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2055 2056 // Add the consecutive indices to the vector value. 2057 Constant *Cv = ConstantVector::get(Indices); 2058 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2059 Step = Builder.CreateVectorSplat(VLen, Step); 2060 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2061 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2062 // which can be found from the original scalar operations. 2063 Step = Builder.CreateMul(Cv, Step); 2064 return Builder.CreateAdd(Val, Step, "induction"); 2065 } 2066 2067 // Floating point induction. 2068 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2069 "Binary Opcode should be specified for FP induction"); 2070 // Create a vector of consecutive numbers from zero to VF. 2071 for (int i = 0; i < VLen; ++i) 2072 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2073 2074 // Add the consecutive indices to the vector value. 2075 Constant *Cv = ConstantVector::get(Indices); 2076 2077 Step = Builder.CreateVectorSplat(VLen, Step); 2078 2079 // Floating point operations had to be 'fast' to enable the induction. 2080 FastMathFlags Flags; 2081 Flags.setFast(); 2082 2083 Value *MulOp = Builder.CreateFMul(Cv, Step); 2084 if (isa<Instruction>(MulOp)) 2085 // Have to check, MulOp may be a constant 2086 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2087 2088 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2089 if (isa<Instruction>(BOp)) 2090 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2091 return BOp; 2092 } 2093 2094 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2095 Instruction *EntryVal, 2096 const InductionDescriptor &ID) { 2097 // We shouldn't have to build scalar steps if we aren't vectorizing. 2098 assert(VF.isVector() && "VF should be greater than one"); 2099 assert(!VF.isScalable() && 2100 "the code below assumes a fixed number of elements at compile time"); 2101 // Get the value type and ensure it and the step have the same integer type. 2102 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2103 assert(ScalarIVTy == Step->getType() && 2104 "Val and Step should have the same type"); 2105 2106 // We build scalar steps for both integer and floating-point induction 2107 // variables. 
Here, we determine the kind of arithmetic we will perform. 2108 Instruction::BinaryOps AddOp; 2109 Instruction::BinaryOps MulOp; 2110 if (ScalarIVTy->isIntegerTy()) { 2111 AddOp = Instruction::Add; 2112 MulOp = Instruction::Mul; 2113 } else { 2114 AddOp = ID.getInductionOpcode(); 2115 MulOp = Instruction::FMul; 2116 } 2117 2118 // Determine the number of scalars we need to generate for each unroll 2119 // iteration. If EntryVal is uniform, we only need to generate the first 2120 // lane. Otherwise, we generate all VF values. 2121 unsigned Lanes = 2122 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2123 ? 1 2124 : VF.getKnownMinValue(); 2125 // Compute the scalar steps and save the results in VectorLoopValueMap. 2126 for (unsigned Part = 0; Part < UF; ++Part) { 2127 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2128 auto *StartIdx = getSignedIntOrFpConstant( 2129 ScalarIVTy, VF.getKnownMinValue() * Part + Lane); 2130 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2131 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2132 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2133 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2134 } 2135 } 2136 } 2137 2138 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2139 assert(V != Induction && "The new induction variable should not be used."); 2140 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2141 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2142 2143 // If we have a stride that is replaced by one, do it here. Defer this for 2144 // the VPlan-native path until we start running Legal checks in that path. 2145 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2146 V = ConstantInt::get(V->getType(), 1); 2147 2148 // If we have a vector mapped to this value, return it. 2149 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2150 return VectorLoopValueMap.getVectorValue(V, Part); 2151 2152 // If the value has not been vectorized, check if it has been scalarized 2153 // instead. If it has been scalarized, and we actually need the value in 2154 // vector form, we will construct the vector values on demand. 2155 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2156 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2157 2158 // If we've scalarized a value, that value should be an instruction. 2159 auto *I = cast<Instruction>(V); 2160 2161 // If we aren't vectorizing, we can just copy the scalar map values over to 2162 // the vector map. 2163 if (VF == 1) { 2164 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2165 return ScalarValue; 2166 } 2167 2168 // Get the last scalar instruction we generated for V and Part. If the value 2169 // is known to be uniform after vectorization, this corresponds to lane zero 2170 // of the Part unroll iteration. Otherwise, the last instruction is the one 2171 // we created for the last vector lane of the Part unroll iteration. 2172 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2173 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2174 ? 0 2175 : VF.getKnownMinValue() - 1; 2176 auto *LastInst = cast<Instruction>( 2177 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2178 2179 // Set the insert point after the last scalarized instruction. This ensures 2180 // the insertelement sequence will directly follow the scalar definitions. 
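// For illustration only (names and element type are hypothetical): for VF = 4,
// a non-uniform value with scalar parts %s.0 ... %s.3 is packed roughly as
//   %p.0 = insertelement <4 x i32> undef, i32 %s.0, i32 0
//   %p.1 = insertelement <4 x i32> %p.0,  i32 %s.1, i32 1
//   %p.2 = insertelement <4 x i32> %p.1,  i32 %s.2, i32 2
//   %p.3 = insertelement <4 x i32> %p.2,  i32 %s.3, i32 3
// emitted right after the last scalar definition.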
2181 auto OldIP = Builder.saveIP(); 2182 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2183 Builder.SetInsertPoint(&*NewIP); 2184 2185 // However, if we are vectorizing, we need to construct the vector values. 2186 // If the value is known to be uniform after vectorization, we can just 2187 // broadcast the scalar value corresponding to lane zero for each unroll 2188 // iteration. Otherwise, we construct the vector values using insertelement 2189 // instructions. Since the resulting vectors are stored in 2190 // VectorLoopValueMap, we will only generate the insertelements once. 2191 Value *VectorValue = nullptr; 2192 if (Cost->isUniformAfterVectorization(I, VF)) { 2193 VectorValue = getBroadcastInstrs(ScalarValue); 2194 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2195 } else { 2196 // Initialize packing with insertelements to start from undef. 2197 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2198 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2199 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2200 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2201 packScalarIntoVectorValue(V, {Part, Lane}); 2202 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2203 } 2204 Builder.restoreIP(OldIP); 2205 return VectorValue; 2206 } 2207 2208 // If this scalar is unknown, assume that it is a constant or that it is 2209 // loop invariant. Broadcast V and save the value for future uses. 2210 Value *B = getBroadcastInstrs(V); 2211 VectorLoopValueMap.setVectorValue(V, Part, B); 2212 return B; 2213 } 2214 2215 Value * 2216 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2217 const VPIteration &Instance) { 2218 // If the value is not an instruction contained in the loop, it should 2219 // already be scalar. 2220 if (OrigLoop->isLoopInvariant(V)) 2221 return V; 2222 2223 assert(Instance.Lane > 0 2224 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2225 : true && "Uniform values only have lane zero"); 2226 2227 // If the value from the original loop has not been vectorized, it is 2228 // represented by UF x VF scalar values in the new loop. Return the requested 2229 // scalar value. 2230 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2231 return VectorLoopValueMap.getScalarValue(V, Instance); 2232 2233 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2234 // for the given unroll part. If this entry is not a vector type (i.e., the 2235 // vectorization factor is one), there is no need to generate an 2236 // extractelement instruction. 2237 auto *U = getOrCreateVectorValue(V, Instance.Part); 2238 if (!U->getType()->isVectorTy()) { 2239 assert(VF == 1 && "Value not scalarized has non-vector type"); 2240 return U; 2241 } 2242 2243 // Otherwise, the value from the original loop has been vectorized and is 2244 // represented by UF vector values. Extract and return the requested scalar 2245 // value from the appropriate vector lane. 
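// For illustration only (names are hypothetical): with VF = 4 and
// Instance = {Part 1, Lane 2}, the extract below is roughly
//   %scalar = extractelement <4 x i32> %v.part1, i32 2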
2246 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2247 } 2248 2249 void InnerLoopVectorizer::packScalarIntoVectorValue( 2250 Value *V, const VPIteration &Instance) { 2251 assert(V != Induction && "The new induction variable should not be used."); 2252 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2253 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2254 2255 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2256 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2257 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2258 Builder.getInt32(Instance.Lane)); 2259 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2260 } 2261 2262 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2263 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2264 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2265 SmallVector<int, 8> ShuffleMask; 2266 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2267 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2268 2269 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2270 ShuffleMask, "reverse"); 2271 } 2272 2273 // Return whether we allow using masked interleave-groups (for dealing with 2274 // strided loads/stores that reside in predicated blocks, or for dealing 2275 // with gaps). 2276 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2277 // If an override option has been passed in for interleaved accesses, use it. 2278 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2279 return EnableMaskedInterleavedMemAccesses; 2280 2281 return TTI.enableMaskedInterleavedAccessVectorization(); 2282 } 2283 2284 // Try to vectorize the interleave group that \p Instr belongs to. 2285 // 2286 // E.g. Translate following interleaved load group (factor = 3): 2287 // for (i = 0; i < N; i+=3) { 2288 // R = Pic[i]; // Member of index 0 2289 // G = Pic[i+1]; // Member of index 1 2290 // B = Pic[i+2]; // Member of index 2 2291 // ... // do something to R, G, B 2292 // } 2293 // To: 2294 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2295 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2296 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2297 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2298 // 2299 // Or translate following interleaved store group (factor = 3): 2300 // for (i = 0; i < N; i+=3) { 2301 // ... do something to R, G, B 2302 // Pic[i] = R; // Member of index 0 2303 // Pic[i+1] = G; // Member of index 1 2304 // Pic[i+2] = B; // Member of index 2 2305 // } 2306 // To: 2307 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2308 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2309 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2310 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2311 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2312 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2313 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2314 VPValue *Addr, VPValue *BlockInMask) { 2315 Instruction *Instr = Group->getInsertPos(); 2316 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2317 2318 // Prepare for the vector type of the interleaved load/store. 
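// E.g. for i32 members, VF = 4 and an interleave factor of 3 (as in the R,G,B
// example above), VecTy below is <12 x i32>. (Illustrative values.)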
2319 Type *ScalarTy = getMemInstValueType(Instr); 2320 unsigned InterleaveFactor = Group->getFactor(); 2321 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2322 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2323 2324 // Prepare for the new pointers. 2325 SmallVector<Value *, 2> AddrParts; 2326 unsigned Index = Group->getIndex(Instr); 2327 2328 // TODO: extend the masked interleaved-group support to reversed access. 2329 assert((!BlockInMask || !Group->isReverse()) && 2330 "Reversed masked interleave-group not supported."); 2331 2332 // If the group is reverse, adjust the index to refer to the last vector lane 2333 // instead of the first. We adjust the index from the first vector lane, 2334 // rather than directly getting the pointer for lane VF - 1, because the 2335 // pointer operand of the interleaved access is supposed to be uniform. For 2336 // uniform instructions, we're only required to generate a value for the 2337 // first vector lane in each unroll iteration. 2338 assert(!VF.isScalable() && 2339 "scalable vector reverse operation is not implemented"); 2340 if (Group->isReverse()) 2341 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2342 2343 for (unsigned Part = 0; Part < UF; Part++) { 2344 Value *AddrPart = State.get(Addr, {Part, 0}); 2345 setDebugLocFromInst(Builder, AddrPart); 2346 2347 // Notice current instruction could be any index. Need to adjust the address 2348 // to the member of index 0. 2349 // 2350 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2351 // b = A[i]; // Member of index 0 2352 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2353 // 2354 // E.g. A[i+1] = a; // Member of index 1 2355 // A[i] = b; // Member of index 0 2356 // A[i+2] = c; // Member of index 2 (Current instruction) 2357 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2358 2359 bool InBounds = false; 2360 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2361 InBounds = gep->isInBounds(); 2362 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2363 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2364 2365 // Cast to the vector pointer type. 2366 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2367 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2368 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2369 } 2370 2371 setDebugLocFromInst(Builder, Instr); 2372 Value *UndefVec = UndefValue::get(VecTy); 2373 2374 Value *MaskForGaps = nullptr; 2375 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2376 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2377 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2378 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2379 } 2380 2381 // Vectorize the interleaved load group. 2382 if (isa<LoadInst>(Instr)) { 2383 // For each unroll part, create a wide load for the group. 
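// For illustration only: when the group is predicated, the VF-wide block mask
// is first replicated so that it covers every member of each tuple; e.g. for
// VF = 4 and a factor of 3 the replicated shuffle mask is
// <0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3>, and any mask needed for gaps is then
// AND'ed on top of it.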
2384 SmallVector<Value *, 2> NewLoads; 2385 for (unsigned Part = 0; Part < UF; Part++) { 2386 Instruction *NewLoad; 2387 if (BlockInMask || MaskForGaps) { 2388 assert(useMaskedInterleavedAccesses(*TTI) && 2389 "masked interleaved groups are not allowed."); 2390 Value *GroupMask = MaskForGaps; 2391 if (BlockInMask) { 2392 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2393 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2394 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2395 Value *ShuffledMask = Builder.CreateShuffleVector( 2396 BlockInMaskPart, Undefs, 2397 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2398 "interleaved.mask"); 2399 GroupMask = MaskForGaps 2400 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2401 MaskForGaps) 2402 : ShuffledMask; 2403 } 2404 NewLoad = 2405 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2406 GroupMask, UndefVec, "wide.masked.vec"); 2407 } 2408 else 2409 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2410 Group->getAlign(), "wide.vec"); 2411 Group->addMetadata(NewLoad); 2412 NewLoads.push_back(NewLoad); 2413 } 2414 2415 // For each member in the group, shuffle out the appropriate data from the 2416 // wide loads. 2417 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2418 Instruction *Member = Group->getMember(I); 2419 2420 // Skip the gaps in the group. 2421 if (!Member) 2422 continue; 2423 2424 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2425 auto StrideMask = 2426 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2427 for (unsigned Part = 0; Part < UF; Part++) { 2428 Value *StridedVec = Builder.CreateShuffleVector( 2429 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2430 2431 // If this member has different type, cast the result type. 2432 if (Member->getType() != ScalarTy) { 2433 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2434 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2435 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2436 } 2437 2438 if (Group->isReverse()) 2439 StridedVec = reverseVector(StridedVec); 2440 2441 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2442 } 2443 } 2444 return; 2445 } 2446 2447 // The sub vector type for current instruction. 2448 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2449 auto *SubVT = VectorType::get(ScalarTy, VF); 2450 2451 // Vectorize the interleaved store group. 2452 for (unsigned Part = 0; Part < UF; Part++) { 2453 // Collect the stored vector from each member. 2454 SmallVector<Value *, 4> StoredVecs; 2455 for (unsigned i = 0; i < InterleaveFactor; i++) { 2456 // Interleaved store group doesn't allow a gap, so each index has a member 2457 Instruction *Member = Group->getMember(i); 2458 assert(Member && "Fail to get a member from an interleaved store group"); 2459 2460 Value *StoredVec = getOrCreateVectorValue( 2461 cast<StoreInst>(Member)->getValueOperand(), Part); 2462 if (Group->isReverse()) 2463 StoredVec = reverseVector(StoredVec); 2464 2465 // If this member has different type, cast it to a unified type. 2466 2467 if (StoredVec->getType() != SubVT) 2468 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2469 2470 StoredVecs.push_back(StoredVec); 2471 } 2472 2473 // Concatenate all vectors into a wide vector. 2474 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2475 2476 // Interleave the elements in the wide vector. 
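// E.g. with VF = 4 and a factor of 3 the interleave mask used below is
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>, matching the store example above.
// (Illustrative values.)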
2477 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2478 Value *IVec = Builder.CreateShuffleVector( 2479 WideVec, UndefVec, 2480 createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2481 "interleaved.vec"); 2482 2483 Instruction *NewStoreInstr; 2484 if (BlockInMask) { 2485 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2486 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2487 Value *ShuffledMask = Builder.CreateShuffleVector( 2488 BlockInMaskPart, Undefs, 2489 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2490 "interleaved.mask"); 2491 NewStoreInstr = Builder.CreateMaskedStore( 2492 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2493 } 2494 else 2495 NewStoreInstr = 2496 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2497 2498 Group->addMetadata(NewStoreInstr); 2499 } 2500 } 2501 2502 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2503 VPTransformState &State, 2504 VPValue *Addr, 2505 VPValue *StoredValue, 2506 VPValue *BlockInMask) { 2507 // Attempt to issue a wide load. 2508 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2509 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2510 2511 assert((LI || SI) && "Invalid Load/Store instruction"); 2512 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2513 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2514 2515 LoopVectorizationCostModel::InstWidening Decision = 2516 Cost->getWideningDecision(Instr, VF); 2517 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2518 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2519 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2520 "CM decision is not to widen the memory instruction"); 2521 2522 Type *ScalarDataTy = getMemInstValueType(Instr); 2523 2524 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2525 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2526 const Align Alignment = getLoadStoreAlignment(Instr); 2527 2528 // Determine if the pointer operand of the access is either consecutive or 2529 // reverse consecutive. 2530 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2531 bool ConsecutiveStride = 2532 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2533 bool CreateGatherScatter = 2534 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2535 2536 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2537 // gather/scatter. Otherwise Decision should have been to Scalarize. 2538 assert((ConsecutiveStride || CreateGatherScatter) && 2539 "The instruction should be scalarized"); 2540 (void)ConsecutiveStride; 2541 2542 VectorParts BlockInMaskParts(UF); 2543 bool isMaskRequired = BlockInMask; 2544 if (isMaskRequired) 2545 for (unsigned Part = 0; Part < UF; ++Part) 2546 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2547 2548 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2549 // Calculate the pointer for the specific unroll-part. 2550 GetElementPtrInst *PartPtr = nullptr; 2551 2552 bool InBounds = false; 2553 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2554 InBounds = gep->isInBounds(); 2555 2556 if (Reverse) { 2557 // If the address is consecutive but reversed, then the 2558 // wide store needs to start at the last vector element. 
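// For illustration only: with VF = 4, part P is based at Ptr - (P * 4) - 3,
// so part 0 covers elements [-3, 0] and part 1 covers [-7, -4] relative to
// Ptr; the loaded/stored value (and any mask) is then reversed to restore the
// original element order.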
2559 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2560 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2561 PartPtr->setIsInBounds(InBounds); 2562 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2563 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2564 PartPtr->setIsInBounds(InBounds); 2565 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2566 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2567 } else { 2568 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2569 ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue()))); 2570 PartPtr->setIsInBounds(InBounds); 2571 } 2572 2573 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2574 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2575 }; 2576 2577 // Handle Stores: 2578 if (SI) { 2579 setDebugLocFromInst(Builder, SI); 2580 2581 for (unsigned Part = 0; Part < UF; ++Part) { 2582 Instruction *NewSI = nullptr; 2583 Value *StoredVal = State.get(StoredValue, Part); 2584 if (CreateGatherScatter) { 2585 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2586 Value *VectorGep = State.get(Addr, Part); 2587 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2588 MaskPart); 2589 } else { 2590 if (Reverse) { 2591 // If we store to reverse consecutive memory locations, then we need 2592 // to reverse the order of elements in the stored value. 2593 StoredVal = reverseVector(StoredVal); 2594 // We don't want to update the value in the map as it might be used in 2595 // another expression. So don't call resetVectorValue(StoredVal). 2596 } 2597 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2598 if (isMaskRequired) 2599 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2600 BlockInMaskParts[Part]); 2601 else 2602 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2603 } 2604 addMetadata(NewSI, SI); 2605 } 2606 return; 2607 } 2608 2609 // Handle loads. 2610 assert(LI && "Must have a load instruction"); 2611 setDebugLocFromInst(Builder, LI); 2612 for (unsigned Part = 0; Part < UF; ++Part) { 2613 Value *NewLI; 2614 if (CreateGatherScatter) { 2615 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2616 Value *VectorGep = State.get(Addr, Part); 2617 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2618 nullptr, "wide.masked.gather"); 2619 addMetadata(NewLI, LI); 2620 } else { 2621 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2622 if (isMaskRequired) 2623 NewLI = Builder.CreateMaskedLoad( 2624 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2625 "wide.masked.load"); 2626 else 2627 NewLI = 2628 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2629 2630 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2631 addMetadata(NewLI, LI); 2632 if (Reverse) 2633 NewLI = reverseVector(NewLI); 2634 } 2635 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2636 } 2637 } 2638 2639 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2640 const VPIteration &Instance, 2641 bool IfPredicateInstr, 2642 VPTransformState &State) { 2643 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2644 2645 setDebugLocFromInst(Builder, Instr); 2646 2647 // Does this instruction return a value ? 
2648 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2649 2650 Instruction *Cloned = Instr->clone(); 2651 if (!IsVoidRetTy) 2652 Cloned->setName(Instr->getName() + ".cloned"); 2653 2654 // Replace the operands of the cloned instructions with their scalar 2655 // equivalents in the new loop. 2656 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2657 auto *NewOp = State.get(User.getOperand(op), Instance); 2658 Cloned->setOperand(op, NewOp); 2659 } 2660 addNewMetadata(Cloned, Instr); 2661 2662 // Place the cloned scalar in the new loop. 2663 Builder.Insert(Cloned); 2664 2665 // Add the cloned scalar to the scalar map entry. 2666 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2667 2668 // If we just cloned a new assumption, add it the assumption cache. 2669 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2670 if (II->getIntrinsicID() == Intrinsic::assume) 2671 AC->registerAssumption(II); 2672 2673 // End if-block. 2674 if (IfPredicateInstr) 2675 PredicatedInstructions.push_back(Cloned); 2676 } 2677 2678 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2679 Value *End, Value *Step, 2680 Instruction *DL) { 2681 BasicBlock *Header = L->getHeader(); 2682 BasicBlock *Latch = L->getLoopLatch(); 2683 // As we're just creating this loop, it's possible no latch exists 2684 // yet. If so, use the header as this will be a single block loop. 2685 if (!Latch) 2686 Latch = Header; 2687 2688 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2689 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2690 setDebugLocFromInst(Builder, OldInst); 2691 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2692 2693 Builder.SetInsertPoint(Latch->getTerminator()); 2694 setDebugLocFromInst(Builder, OldInst); 2695 2696 // Create i+1 and fill the PHINode. 2697 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2698 Induction->addIncoming(Start, L->getLoopPreheader()); 2699 Induction->addIncoming(Next, Latch); 2700 // Create the compare. 2701 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2702 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2703 2704 // Now we have two terminators. Remove the old one from the block. 2705 Latch->getTerminator()->eraseFromParent(); 2706 2707 return Induction; 2708 } 2709 2710 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2711 if (TripCount) 2712 return TripCount; 2713 2714 assert(L && "Create Trip Count for null loop."); 2715 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2716 // Find the loop boundaries. 2717 ScalarEvolution *SE = PSE.getSE(); 2718 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2719 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2720 "Invalid loop count"); 2721 2722 Type *IdxTy = Legal->getWidestInductionType(); 2723 assert(IdxTy && "No type for induction"); 2724 2725 // The exit count might have the type of i64 while the phi is i32. This can 2726 // happen if we have an induction variable that is sign extended before the 2727 // compare. The only way that we get a backedge taken count is that the 2728 // induction variable was signed and as such will not overflow. In such a case 2729 // truncation is legal. 
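// For illustration only: a backedge-taken count of i64 7 with a widest
// induction type of i32 is truncated to i32 7 here, and the trip count
// computed below then becomes 7 + 1 = 8.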
2730 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2731 IdxTy->getPrimitiveSizeInBits()) 2732 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2733 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2734 2735 // Get the total trip count from the count by adding 1. 2736 const SCEV *ExitCount = SE->getAddExpr( 2737 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2738 2739 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2740 2741 // Expand the trip count and place the new instructions in the preheader. 2742 // Notice that the pre-header does not change, only the loop body. 2743 SCEVExpander Exp(*SE, DL, "induction"); 2744 2745 // Count holds the overall loop count (N). 2746 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2747 L->getLoopPreheader()->getTerminator()); 2748 2749 if (TripCount->getType()->isPointerTy()) 2750 TripCount = 2751 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2752 L->getLoopPreheader()->getTerminator()); 2753 2754 return TripCount; 2755 } 2756 2757 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2758 if (VectorTripCount) 2759 return VectorTripCount; 2760 2761 Value *TC = getOrCreateTripCount(L); 2762 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2763 2764 Type *Ty = TC->getType(); 2765 // This is where we can make the step a runtime constant. 2766 assert(!VF.isScalable() && "scalable vectorization is not supported yet"); 2767 Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF); 2768 2769 // If the tail is to be folded by masking, round the number of iterations N 2770 // up to a multiple of Step instead of rounding down. This is done by first 2771 // adding Step-1 and then rounding down. Note that it's ok if this addition 2772 // overflows: the vector induction variable will eventually wrap to zero given 2773 // that it starts at zero and its Step is a power of two; the loop will then 2774 // exit, with the last early-exit vector comparison also producing all-true. 2775 if (Cost->foldTailByMasking()) { 2776 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2777 "VF*UF must be a power of 2 when folding tail by masking"); 2778 TC = Builder.CreateAdd( 2779 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 2780 } 2781 2782 // Now we need to generate the expression for the part of the loop that the 2783 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2784 // iterations are not required for correctness, or N - Step, otherwise. Step 2785 // is equal to the vectorization factor (number of SIMD elements) times the 2786 // unroll factor (number of SIMD instructions). 2787 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2788 2789 // If there is a non-reversed interleaved group that may speculatively access 2790 // memory out-of-bounds, we need to ensure that there will be at least one 2791 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2792 // the trip count, we set the remainder to be equal to the step. If the step 2793 // does not evenly divide the trip count, no adjustment is necessary since 2794 // there will already be scalar iterations. Note that the minimum iterations 2795 // check ensures that N >= Step. 
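  // For example, with a trip count of 16 and VF * UF = 8, R = 16 % 8 would be
  // 0 and no scalar iterations would remain; requiring a scalar epilogue
  // forces R to 8 below, so the vector loop covers 8 iterations and the
  // epilogue the remaining 8. (Illustrative numbers only.)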
2796 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 2797 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2798 R = Builder.CreateSelect(IsZero, Step, R); 2799 } 2800 2801 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2802 2803 return VectorTripCount; 2804 } 2805 2806 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2807 const DataLayout &DL) { 2808 // Verify that V is a vector type with same number of elements as DstVTy. 2809 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2810 unsigned VF = DstFVTy->getNumElements(); 2811 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2812 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2813 Type *SrcElemTy = SrcVecTy->getElementType(); 2814 Type *DstElemTy = DstFVTy->getElementType(); 2815 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2816 "Vector elements must have same size"); 2817 2818 // Do a direct cast if element types are castable. 2819 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2820 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2821 } 2822 // V cannot be directly casted to desired vector type. 2823 // May happen when V is a floating point vector but DstVTy is a vector of 2824 // pointers or vice-versa. Handle this using a two-step bitcast using an 2825 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2826 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2827 "Only one type should be a pointer type"); 2828 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2829 "Only one type should be a floating point type"); 2830 Type *IntTy = 2831 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2832 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2833 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2834 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2835 } 2836 2837 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2838 BasicBlock *Bypass) { 2839 Value *Count = getOrCreateTripCount(L); 2840 // Reuse existing vector loop preheader for TC checks. 2841 // Note that new preheader block is generated for vector loop. 2842 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2843 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2844 2845 // Generate code to check if the loop's trip count is less than VF * UF, or 2846 // equal to it in case a scalar epilogue is required; this implies that the 2847 // vector trip count is zero. This check also covers the case where adding one 2848 // to the backedge-taken count overflowed leading to an incorrect trip count 2849 // of zero. In this case we will also jump to the scalar loop. 2850 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2851 : ICmpInst::ICMP_ULT; 2852 2853 // If tail is to be folded, vector loop takes care of all iterations. 2854 Value *CheckMinIters = Builder.getFalse(); 2855 if (!Cost->foldTailByMasking()) { 2856 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2857 CheckMinIters = Builder.CreateICmp( 2858 P, Count, 2859 ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF), 2860 "min.iters.check"); 2861 } 2862 // Create new preheader for vector loop. 
2863 LoopVectorPreHeader = 2864 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2865 "vector.ph"); 2866 2867 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2868 DT->getNode(Bypass)->getIDom()) && 2869 "TC check is expected to dominate Bypass"); 2870 2871 // Update dominator for Bypass & LoopExit. 2872 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2873 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2874 2875 ReplaceInstWithInst( 2876 TCCheckBlock->getTerminator(), 2877 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2878 LoopBypassBlocks.push_back(TCCheckBlock); 2879 } 2880 2881 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2882 // Reuse existing vector loop preheader for SCEV checks. 2883 // Note that new preheader block is generated for vector loop. 2884 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2885 2886 // Generate the code to check that the SCEV assumptions that we made. 2887 // We want the new basic block to start at the first instruction in a 2888 // sequence of instructions that form a check. 2889 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2890 "scev.check"); 2891 Value *SCEVCheck = Exp.expandCodeForPredicate( 2892 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2893 2894 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2895 if (C->isZero()) 2896 return; 2897 2898 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2899 (OptForSizeBasedOnProfile && 2900 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2901 "Cannot SCEV check stride or overflow when optimizing for size"); 2902 2903 SCEVCheckBlock->setName("vector.scevcheck"); 2904 // Create new preheader for vector loop. 2905 LoopVectorPreHeader = 2906 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2907 nullptr, "vector.ph"); 2908 2909 // Update dominator only if this is first RT check. 2910 if (LoopBypassBlocks.empty()) { 2911 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2912 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2913 } 2914 2915 ReplaceInstWithInst( 2916 SCEVCheckBlock->getTerminator(), 2917 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2918 LoopBypassBlocks.push_back(SCEVCheckBlock); 2919 AddedSafetyChecks = true; 2920 } 2921 2922 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2923 // VPlan-native path does not do any analysis for runtime checks currently. 2924 if (EnableVPlanNativePath) 2925 return; 2926 2927 // Reuse existing vector loop preheader for runtime memory checks. 2928 // Note that new preheader block is generated for vector loop. 2929 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2930 2931 // Generate the code that checks in runtime if arrays overlap. We put the 2932 // checks into a separate block to make the more common case of few elements 2933 // faster. 
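  // Conceptually, each emitted check compares the address ranges touched by a
  // pair of pointer groups, roughly (A.end <= B.start) || (B.end <= A.start);
  // only when some pair may actually overlap do we branch to the scalar loop.
  // (Rough sketch of the generated condition, not its exact IR shape.)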
2934 auto *LAI = Legal->getLAI();
2935 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2936 if (!RtPtrChecking.Need)
2937 return;
2938 Instruction *FirstCheckInst;
2939 Instruction *MemRuntimeCheck;
2940 std::tie(FirstCheckInst, MemRuntimeCheck) =
2941 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2942 RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2943 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
2944 "claimed checks are required");
2945 
2946 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2947 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2948 "Cannot emit memory checks when optimizing for size, unless forced "
2949 "to vectorize.");
2950 ORE->emit([&]() {
2951 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2952 L->getStartLoc(), L->getHeader())
2953 << "Code-size may be reduced by not forcing "
2954 "vectorization, or by source-code modifications "
2955 "eliminating the need for runtime checks "
2956 "(e.g., adding 'restrict').";
2957 });
2958 }
2959 
2960 MemCheckBlock->setName("vector.memcheck");
2961 // Create new preheader for vector loop.
2962 LoopVectorPreHeader =
2963 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2964 "vector.ph");
2965 
2966 // Update dominator only if this is first RT check.
2967 if (LoopBypassBlocks.empty()) {
2968 DT->changeImmediateDominator(Bypass, MemCheckBlock);
2969 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2970 }
2971 
2972 ReplaceInstWithInst(
2973 MemCheckBlock->getTerminator(),
2974 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2975 LoopBypassBlocks.push_back(MemCheckBlock);
2976 AddedSafetyChecks = true;
2977 
2978 // We currently don't use LoopVersioning for the actual loop cloning but we
2979 // still use it to add the noalias metadata.
2980 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2981 PSE.getSE());
2982 LVer->prepareNoAliasMetadata();
2983 }
2984 
2985 Value *InnerLoopVectorizer::emitTransformedIndex(
2986 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2987 const InductionDescriptor &ID) const {
2988 
2989 SCEVExpander Exp(*SE, DL, "induction");
2990 auto Step = ID.getStep();
2991 auto StartValue = ID.getStartValue();
2992 assert(Index->getType() == Step->getType() &&
2993 "Index type does not match StepValue type");
2994 
2995 // Note: the IR at this point is broken. We cannot use SE to create any new
2996 // SCEV and then expand it, hoping that SCEV's simplification will give us
2997 // better code. Unfortunately, attempting to do so on invalid IR may lead to
2998 // various SCEV crashes. So all we can do is use the builder and rely
2999 // on InstCombine for future simplifications. Here we handle only some
3000 // trivial cases.
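  // Specifically, the helper lambdas below fold the trivial identities
  // (x + 0 ==> x, x * 1 ==> x) on the spot rather than emitting the
  // instruction and hoping for later cleanup.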
3001 auto CreateAdd = [&B](Value *X, Value *Y) { 3002 assert(X->getType() == Y->getType() && "Types don't match!"); 3003 if (auto *CX = dyn_cast<ConstantInt>(X)) 3004 if (CX->isZero()) 3005 return Y; 3006 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3007 if (CY->isZero()) 3008 return X; 3009 return B.CreateAdd(X, Y); 3010 }; 3011 3012 auto CreateMul = [&B](Value *X, Value *Y) { 3013 assert(X->getType() == Y->getType() && "Types don't match!"); 3014 if (auto *CX = dyn_cast<ConstantInt>(X)) 3015 if (CX->isOne()) 3016 return Y; 3017 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3018 if (CY->isOne()) 3019 return X; 3020 return B.CreateMul(X, Y); 3021 }; 3022 3023 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3024 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3025 // the DomTree is not kept up-to-date for additional blocks generated in the 3026 // vector loop. By using the header as insertion point, we guarantee that the 3027 // expanded instructions dominate all their uses. 3028 auto GetInsertPoint = [this, &B]() { 3029 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3030 if (InsertBB != LoopVectorBody && 3031 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3032 return LoopVectorBody->getTerminator(); 3033 return &*B.GetInsertPoint(); 3034 }; 3035 switch (ID.getKind()) { 3036 case InductionDescriptor::IK_IntInduction: { 3037 assert(Index->getType() == StartValue->getType() && 3038 "Index type does not match StartValue type"); 3039 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3040 return B.CreateSub(StartValue, Index); 3041 auto *Offset = CreateMul( 3042 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3043 return CreateAdd(StartValue, Offset); 3044 } 3045 case InductionDescriptor::IK_PtrInduction: { 3046 assert(isa<SCEVConstant>(Step) && 3047 "Expected constant step for pointer induction"); 3048 return B.CreateGEP( 3049 StartValue->getType()->getPointerElementType(), StartValue, 3050 CreateMul(Index, 3051 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3052 } 3053 case InductionDescriptor::IK_FpInduction: { 3054 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3055 auto InductionBinOp = ID.getInductionBinOp(); 3056 assert(InductionBinOp && 3057 (InductionBinOp->getOpcode() == Instruction::FAdd || 3058 InductionBinOp->getOpcode() == Instruction::FSub) && 3059 "Original bin op should be defined for FP induction"); 3060 3061 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3062 3063 // Floating point operations had to be 'fast' to enable the induction. 3064 FastMathFlags Flags; 3065 Flags.setFast(); 3066 3067 Value *MulExp = B.CreateFMul(StepValue, Index); 3068 if (isa<Instruction>(MulExp)) 3069 // We have to check, the MulExp may be a constant. 
3070 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3071 3072 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3073 "induction"); 3074 if (isa<Instruction>(BOp)) 3075 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3076 3077 return BOp; 3078 } 3079 case InductionDescriptor::IK_NoInduction: 3080 return nullptr; 3081 } 3082 llvm_unreachable("invalid enum"); 3083 } 3084 3085 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3086 LoopScalarBody = OrigLoop->getHeader(); 3087 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3088 LoopExitBlock = OrigLoop->getExitBlock(); 3089 assert(LoopExitBlock && "Must have an exit block"); 3090 assert(LoopVectorPreHeader && "Invalid loop structure"); 3091 3092 LoopMiddleBlock = 3093 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3094 LI, nullptr, Twine(Prefix) + "middle.block"); 3095 LoopScalarPreHeader = 3096 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3097 nullptr, Twine(Prefix) + "scalar.ph"); 3098 // We intentionally don't let SplitBlock to update LoopInfo since 3099 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3100 // LoopVectorBody is explicitly added to the correct place few lines later. 3101 LoopVectorBody = 3102 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3103 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3104 3105 // Update dominator for loop exit. 3106 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3107 3108 // Create and register the new vector loop. 3109 Loop *Lp = LI->AllocateLoop(); 3110 Loop *ParentLoop = OrigLoop->getParentLoop(); 3111 3112 // Insert the new loop into the loop nest and register the new basic blocks 3113 // before calling any utilities such as SCEV that require valid LoopInfo. 3114 if (ParentLoop) { 3115 ParentLoop->addChildLoop(Lp); 3116 } else { 3117 LI->addTopLevelLoop(Lp); 3118 } 3119 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3120 return Lp; 3121 } 3122 3123 void InnerLoopVectorizer::createInductionResumeValues(Loop *L, 3124 Value *VectorTripCount) { 3125 assert(VectorTripCount && L && "Expected valid arguments"); 3126 // We are going to resume the execution of the scalar loop. 3127 // Go over all of the induction variables that we found and fix the 3128 // PHIs that are left in the scalar version of the loop. 3129 // The starting values of PHI nodes depend on the counter of the last 3130 // iteration in the vectorized loop. 3131 // If we come from a bypass edge then we need to start from the original 3132 // start value. 3133 for (auto &InductionEntry : Legal->getInductionVars()) { 3134 PHINode *OrigPhi = InductionEntry.first; 3135 InductionDescriptor II = InductionEntry.second; 3136 3137 // Create phi nodes to merge from the backedge-taken check block. 3138 PHINode *BCResumeVal = 3139 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3140 LoopScalarPreHeader->getTerminator()); 3141 // Copy original phi DL over to the new one. 3142 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3143 Value *&EndValue = IVEndValues[OrigPhi]; 3144 if (OrigPhi == OldInduction) { 3145 // We know what the end value is. 
3146 EndValue = VectorTripCount; 3147 } else { 3148 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3149 Type *StepType = II.getStep()->getType(); 3150 Instruction::CastOps CastOp = 3151 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3152 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3153 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3154 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3155 EndValue->setName("ind.end"); 3156 } 3157 3158 // The new PHI merges the original incoming value, in case of a bypass, 3159 // or the value at the end of the vectorized loop. 3160 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3161 3162 // Fix the scalar body counter (PHI node). 3163 // The old induction's phi node in the scalar body needs the truncated 3164 // value. 3165 for (BasicBlock *BB : LoopBypassBlocks) 3166 BCResumeVal->addIncoming(II.getStartValue(), BB); 3167 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3168 } 3169 } 3170 3171 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3172 MDNode *OrigLoopID) { 3173 assert(L && "Expected valid loop."); 3174 3175 // The trip counts should be cached by now. 3176 Value *Count = getOrCreateTripCount(L); 3177 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3178 3179 // We need the OrigLoop (scalar loop part) latch terminator to help 3180 // produce correct debug info for the middle block BB instructions. 3181 // The legality check stage guarantees that the loop will have a single 3182 // latch. 3183 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3184 "Scalar loop latch terminator isn't a branch"); 3185 BranchInst *ScalarLatchBr = 3186 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3187 3188 // Add a check in the middle block to see if we have completed 3189 // all of the iterations in the first vector loop. 3190 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3191 // If tail is to be folded, we know we don't need to run the remainder. 3192 Value *CmpN = Builder.getTrue(); 3193 if (!Cost->foldTailByMasking()) { 3194 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3195 VectorTripCount, "cmp.n", 3196 LoopMiddleBlock->getTerminator()); 3197 3198 // Here we use the same DebugLoc as the scalar loop latch branch instead 3199 // of the corresponding compare because they may have ended up with 3200 // different line numbers and we want to avoid awkward line stepping while 3201 // debugging. Eg. if the compare has got a line number inside the loop. 3202 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3203 } 3204 3205 BranchInst *BrInst = 3206 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3207 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3208 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3209 3210 // Get ready to start creating new instructions into the vectorized body. 3211 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3212 "Inconsistent vector loop preheader"); 3213 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3214 3215 Optional<MDNode *> VectorizedLoopID = 3216 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3217 LLVMLoopVectorizeFollowupVectorized}); 3218 if (VectorizedLoopID.hasValue()) { 3219 L->setLoopID(VectorizedLoopID.getValue()); 3220 3221 // Do not setAlreadyVectorized if loop attributes have been defined 3222 // explicitly. 
3223 return LoopVectorPreHeader; 3224 } 3225 3226 // Keep all loop hints from the original loop on the vector loop (we'll 3227 // replace the vectorizer-specific hints below). 3228 if (MDNode *LID = OrigLoop->getLoopID()) 3229 L->setLoopID(LID); 3230 3231 LoopVectorizeHints Hints(L, true, *ORE); 3232 Hints.setAlreadyVectorized(); 3233 3234 #ifdef EXPENSIVE_CHECKS 3235 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3236 LI->verify(*DT); 3237 #endif 3238 3239 return LoopVectorPreHeader; 3240 } 3241 3242 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3243 /* 3244 In this function we generate a new loop. The new loop will contain 3245 the vectorized instructions while the old loop will continue to run the 3246 scalar remainder. 3247 3248 [ ] <-- loop iteration number check. 3249 / | 3250 / v 3251 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3252 | / | 3253 | / v 3254 || [ ] <-- vector pre header. 3255 |/ | 3256 | v 3257 | [ ] \ 3258 | [ ]_| <-- vector loop. 3259 | | 3260 | v 3261 | -[ ] <--- middle-block. 3262 | / | 3263 | / v 3264 -|- >[ ] <--- new preheader. 3265 | | 3266 | v 3267 | [ ] \ 3268 | [ ]_| <-- old scalar loop to handle remainder. 3269 \ | 3270 \ v 3271 >[ ] <-- exit block. 3272 ... 3273 */ 3274 3275 // Get the metadata of the original loop before it gets modified. 3276 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3277 3278 // Create an empty vector loop, and prepare basic blocks for the runtime 3279 // checks. 3280 Loop *Lp = createVectorLoopSkeleton(""); 3281 3282 // Now, compare the new count to zero. If it is zero skip the vector loop and 3283 // jump to the scalar loop. This check also covers the case where the 3284 // backedge-taken count is uint##_max: adding one to it will overflow leading 3285 // to an incorrect trip count of zero. In this (rare) case we will also jump 3286 // to the scalar loop. 3287 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3288 3289 // Generate the code to check any assumptions that we've made for SCEV 3290 // expressions. 3291 emitSCEVChecks(Lp, LoopScalarPreHeader); 3292 3293 // Generate the code that checks in runtime if arrays overlap. We put the 3294 // checks into a separate block to make the more common case of few elements 3295 // faster. 3296 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3297 3298 // Some loops have a single integer induction variable, while other loops 3299 // don't. One example is c++ iterators that often have multiple pointer 3300 // induction variables. In the code below we also support a case where we 3301 // don't have a single induction variable. 3302 // 3303 // We try to obtain an induction variable from the original loop as hard 3304 // as possible. However if we don't find one that: 3305 // - is an integer 3306 // - counts from zero, stepping by one 3307 // - is the size of the widest induction variable type 3308 // then we create a new one. 3309 OldInduction = Legal->getPrimaryInduction(); 3310 Type *IdxTy = Legal->getWidestInductionType(); 3311 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3312 // The loop step is equal to the vectorization factor (num of SIMD elements) 3313 // times the unroll factor (num of SIMD instructions). 
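  // For example, with VF = 4 and UF = 2 the induction variable created below
  // is advanced by 8 on every iteration of the vector loop.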
3314 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3315 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 3316 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3317 Induction = 3318 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3319 getDebugLocFromInstOrOperands(OldInduction)); 3320 3321 // Emit phis for the new starting index of the scalar loop. 3322 createInductionResumeValues(Lp, CountRoundDown); 3323 3324 return completeLoopSkeleton(Lp, OrigLoopID); 3325 } 3326 3327 // Fix up external users of the induction variable. At this point, we are 3328 // in LCSSA form, with all external PHIs that use the IV having one input value, 3329 // coming from the remainder loop. We need those PHIs to also have a correct 3330 // value for the IV when arriving directly from the middle block. 3331 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3332 const InductionDescriptor &II, 3333 Value *CountRoundDown, Value *EndValue, 3334 BasicBlock *MiddleBlock) { 3335 // There are two kinds of external IV usages - those that use the value 3336 // computed in the last iteration (the PHI) and those that use the penultimate 3337 // value (the value that feeds into the phi from the loop latch). 3338 // We allow both, but they, obviously, have different values. 3339 3340 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3341 3342 DenseMap<Value *, Value *> MissingVals; 3343 3344 // An external user of the last iteration's value should see the value that 3345 // the remainder loop uses to initialize its own IV. 3346 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3347 for (User *U : PostInc->users()) { 3348 Instruction *UI = cast<Instruction>(U); 3349 if (!OrigLoop->contains(UI)) { 3350 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3351 MissingVals[UI] = EndValue; 3352 } 3353 } 3354 3355 // An external user of the penultimate value need to see EndValue - Step. 3356 // The simplest way to get this is to recompute it from the constituent SCEVs, 3357 // that is Start + (Step * (CRD - 1)). 3358 for (User *U : OrigPhi->users()) { 3359 auto *UI = cast<Instruction>(U); 3360 if (!OrigLoop->contains(UI)) { 3361 const DataLayout &DL = 3362 OrigLoop->getHeader()->getModule()->getDataLayout(); 3363 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3364 3365 IRBuilder<> B(MiddleBlock->getTerminator()); 3366 Value *CountMinusOne = B.CreateSub( 3367 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3368 Value *CMO = 3369 !II.getStep()->getType()->isIntegerTy() 3370 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3371 II.getStep()->getType()) 3372 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3373 CMO->setName("cast.cmo"); 3374 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3375 Escape->setName("ind.escape"); 3376 MissingVals[UI] = Escape; 3377 } 3378 } 3379 3380 for (auto &I : MissingVals) { 3381 PHINode *PHI = cast<PHINode>(I.first); 3382 // One corner case we have to handle is two IVs "chasing" each-other, 3383 // that is %IV2 = phi [...], [ %IV1, %latch ] 3384 // In this case, if IV1 has an external use, we need to avoid adding both 3385 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3386 // don't already have an incoming value for the middle block. 
3387 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3388 PHI->addIncoming(I.second, MiddleBlock); 3389 } 3390 } 3391 3392 namespace { 3393 3394 struct CSEDenseMapInfo { 3395 static bool canHandle(const Instruction *I) { 3396 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3397 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3398 } 3399 3400 static inline Instruction *getEmptyKey() { 3401 return DenseMapInfo<Instruction *>::getEmptyKey(); 3402 } 3403 3404 static inline Instruction *getTombstoneKey() { 3405 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3406 } 3407 3408 static unsigned getHashValue(const Instruction *I) { 3409 assert(canHandle(I) && "Unknown instruction!"); 3410 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3411 I->value_op_end())); 3412 } 3413 3414 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3415 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3416 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3417 return LHS == RHS; 3418 return LHS->isIdenticalTo(RHS); 3419 } 3420 }; 3421 3422 } // end anonymous namespace 3423 3424 ///Perform cse of induction variable instructions. 3425 static void cse(BasicBlock *BB) { 3426 // Perform simple cse. 3427 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3428 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3429 Instruction *In = &*I++; 3430 3431 if (!CSEDenseMapInfo::canHandle(In)) 3432 continue; 3433 3434 // Check if we can replace this instruction with any of the 3435 // visited instructions. 3436 if (Instruction *V = CSEMap.lookup(In)) { 3437 In->replaceAllUsesWith(V); 3438 In->eraseFromParent(); 3439 continue; 3440 } 3441 3442 CSEMap[In] = In; 3443 } 3444 } 3445 3446 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3447 ElementCount VF, 3448 bool &NeedToScalarize) { 3449 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3450 Function *F = CI->getCalledFunction(); 3451 Type *ScalarRetTy = CI->getType(); 3452 SmallVector<Type *, 4> Tys, ScalarTys; 3453 for (auto &ArgOp : CI->arg_operands()) 3454 ScalarTys.push_back(ArgOp->getType()); 3455 3456 // Estimate cost of scalarized vector call. The source operands are assumed 3457 // to be vectors, so we need to extract individual elements from there, 3458 // execute VF scalar calls, and then gather the result into the vector return 3459 // value. 3460 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3461 TTI::TCK_RecipThroughput); 3462 if (VF.isScalar()) 3463 return ScalarCallCost; 3464 3465 // Compute corresponding vector type for return value and arguments. 3466 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3467 for (Type *ScalarTy : ScalarTys) 3468 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3469 3470 // Compute costs of unpacking argument values for the scalar calls and 3471 // packing the return values to a vector. 3472 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3473 3474 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3475 3476 // If we can't emit a vector call for this function, then the currently found 3477 // cost is the cost we need to return. 
3478 NeedToScalarize = true; 3479 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3480 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3481 3482 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3483 return Cost; 3484 3485 // If the corresponding vector cost is cheaper, return its cost. 3486 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3487 TTI::TCK_RecipThroughput); 3488 if (VectorCallCost < Cost) { 3489 NeedToScalarize = false; 3490 return VectorCallCost; 3491 } 3492 return Cost; 3493 } 3494 3495 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3496 ElementCount VF) { 3497 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3498 assert(ID && "Expected intrinsic call!"); 3499 3500 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3501 return TTI.getIntrinsicInstrCost(CostAttrs, 3502 TargetTransformInfo::TCK_RecipThroughput); 3503 } 3504 3505 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3506 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3507 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3508 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3509 } 3510 3511 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3512 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3513 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3514 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3515 } 3516 3517 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3518 // For every instruction `I` in MinBWs, truncate the operands, create a 3519 // truncated version of `I` and reextend its result. InstCombine runs 3520 // later and will remove any ext/trunc pairs. 3521 SmallPtrSet<Value *, 4> Erased; 3522 for (const auto &KV : Cost->getMinimalBitwidths()) { 3523 // If the value wasn't vectorized, we must maintain the original scalar 3524 // type. The absence of the value from VectorLoopValueMap indicates that it 3525 // wasn't vectorized. 3526 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3527 continue; 3528 for (unsigned Part = 0; Part < UF; ++Part) { 3529 Value *I = getOrCreateVectorValue(KV.first, Part); 3530 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3531 continue; 3532 Type *OriginalTy = I->getType(); 3533 Type *ScalarTruncatedTy = 3534 IntegerType::get(OriginalTy->getContext(), KV.second); 3535 auto *TruncatedTy = FixedVectorType::get( 3536 ScalarTruncatedTy, 3537 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3538 if (TruncatedTy == OriginalTy) 3539 continue; 3540 3541 IRBuilder<> B(cast<Instruction>(I)); 3542 auto ShrinkOperand = [&](Value *V) -> Value * { 3543 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3544 if (ZI->getSrcTy() == TruncatedTy) 3545 return ZI->getOperand(0); 3546 return B.CreateZExtOrTrunc(V, TruncatedTy); 3547 }; 3548 3549 // The actual instruction modification depends on the instruction type, 3550 // unfortunately. 3551 Value *NewI = nullptr; 3552 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3553 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3554 ShrinkOperand(BO->getOperand(1))); 3555 3556 // Any wrapping introduced by shrinking this operation shouldn't be 3557 // considered undefined behavior. So, we can't unconditionally copy 3558 // arithmetic wrapping flags to NewI. 
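  // For example, an 'add nsw i32' that never wraps in 32 bits may well wrap
  // once it is evaluated in the narrower type, so nsw/nuw are not copied over
  // (IncludeWrapFlags is false below).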
3559 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3560 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3561 NewI = 3562 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3563 ShrinkOperand(CI->getOperand(1))); 3564 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3565 NewI = B.CreateSelect(SI->getCondition(), 3566 ShrinkOperand(SI->getTrueValue()), 3567 ShrinkOperand(SI->getFalseValue())); 3568 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3569 switch (CI->getOpcode()) { 3570 default: 3571 llvm_unreachable("Unhandled cast!"); 3572 case Instruction::Trunc: 3573 NewI = ShrinkOperand(CI->getOperand(0)); 3574 break; 3575 case Instruction::SExt: 3576 NewI = B.CreateSExtOrTrunc( 3577 CI->getOperand(0), 3578 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3579 break; 3580 case Instruction::ZExt: 3581 NewI = B.CreateZExtOrTrunc( 3582 CI->getOperand(0), 3583 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3584 break; 3585 } 3586 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3587 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3588 ->getNumElements(); 3589 auto *O0 = B.CreateZExtOrTrunc( 3590 SI->getOperand(0), 3591 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3592 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3593 ->getNumElements(); 3594 auto *O1 = B.CreateZExtOrTrunc( 3595 SI->getOperand(1), 3596 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3597 3598 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3599 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3600 // Don't do anything with the operands, just extend the result. 3601 continue; 3602 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3603 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3604 ->getNumElements(); 3605 auto *O0 = B.CreateZExtOrTrunc( 3606 IE->getOperand(0), 3607 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3608 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3609 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3610 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3611 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3612 ->getNumElements(); 3613 auto *O0 = B.CreateZExtOrTrunc( 3614 EE->getOperand(0), 3615 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3616 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3617 } else { 3618 // If we don't know what to do, be conservative and don't do anything. 3619 continue; 3620 } 3621 3622 // Lastly, extend the result. 3623 NewI->takeName(cast<Instruction>(I)); 3624 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3625 I->replaceAllUsesWith(Res); 3626 cast<Instruction>(I)->eraseFromParent(); 3627 Erased.insert(I); 3628 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3629 } 3630 } 3631 3632 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3633 for (const auto &KV : Cost->getMinimalBitwidths()) { 3634 // If the value wasn't vectorized, we must maintain the original scalar 3635 // type. The absence of the value from VectorLoopValueMap indicates that it 3636 // wasn't vectorized. 
3637 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3638 continue;
3639 for (unsigned Part = 0; Part < UF; ++Part) {
3640 Value *I = getOrCreateVectorValue(KV.first, Part);
3641 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3642 if (Inst && Inst->use_empty()) {
3643 Value *NewI = Inst->getOperand(0);
3644 Inst->eraseFromParent();
3645 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3646 }
3647 }
3648 }
3649 }
3650 
3651 void InnerLoopVectorizer::fixVectorizedLoop() {
3652 // Insert truncates and extends for any truncated instructions as hints to
3653 // InstCombine.
3654 if (VF.isVector())
3655 truncateToMinimalBitwidths();
3656 
3657 // Fix widened non-induction PHIs by setting up the PHI operands.
3658 if (OrigPHIsToFix.size()) {
3659 assert(EnableVPlanNativePath &&
3660 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3661 fixNonInductionPHIs();
3662 }
3663 
3664 // At this point every instruction in the original loop is widened to a
3665 // vector form. Now we need to fix the recurrences in the loop. These PHI
3666 // nodes are currently empty because we did not want to introduce cycles.
3667 // This is the second stage of vectorizing recurrences.
3668 fixCrossIterationPHIs();
3669 
3670 // Forget the original basic block.
3671 PSE.getSE()->forgetLoop(OrigLoop);
3672 
3673 // Fix-up external users of the induction variables.
3674 for (auto &Entry : Legal->getInductionVars())
3675 fixupIVUsers(Entry.first, Entry.second,
3676 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3677 IVEndValues[Entry.first], LoopMiddleBlock);
3678 
3679 fixLCSSAPHIs();
3680 for (Instruction *PI : PredicatedInstructions)
3681 sinkScalarOperands(&*PI);
3682 
3683 // Remove redundant induction instructions.
3684 cse(LoopVectorBody);
3685 
3686 // Set/update profile weights for the vector and remainder loops as the original
3687 // loop iterations are now distributed among them. Note that the original loop,
3688 // represented by LoopScalarBody, becomes the remainder loop after vectorization.
3689 //
3690 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may end
3691 // up with a slightly less accurate result, but that should be OK since the
3692 // profile is not inherently precise anyway. Note also that a possible bypass of
3693 // the vector code caused by legality checks is ignored, optimistically assigning
3694 // all the weight to the vector loop.
3695 assert(!VF.isScalable() &&
3696 "cannot use scalable ElementCount to determine unroll factor");
3697 setProfileInfoAfterUnrolling(
3698 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3699 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3700 }
3701 
3702 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3703 // In order to support recurrences we need to be able to vectorize Phi nodes.
3704 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3705 // stage #2: We now need to fix the recurrences by adding incoming edges to
3706 // the currently empty PHI nodes. At this point every instruction in the
3707 // original loop is widened to a vector form so we can use them to construct
3708 // the incoming edges.
3709 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3710 // Handle first-order recurrences and reductions that need to be fixed.
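  // A first-order recurrence reads the value produced in the previous
  // iteration (e.g. b[i] = a[i] - a[i - 1]), whereas a reduction folds all
  // iterations into a single value (e.g. a running sum).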
3711 if (Legal->isFirstOrderRecurrence(&Phi))
3712 fixFirstOrderRecurrence(&Phi);
3713 else if (Legal->isReductionVariable(&Phi))
3714 fixReduction(&Phi);
3715 }
3716 }
3717 
3718 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3719 // This is the second phase of vectorizing first-order recurrences. An
3720 // overview of the transformation is described below. Suppose we have the
3721 // following loop.
3722 //
3723 // for (int i = 0; i < n; ++i)
3724 // b[i] = a[i] - a[i - 1];
3725 //
3726 // There is a first-order recurrence on "a". For this loop, the shorthand
3727 // scalar IR looks like:
3728 //
3729 // scalar.ph:
3730 // s_init = a[-1]
3731 // br scalar.body
3732 //
3733 // scalar.body:
3734 // i = phi [0, scalar.ph], [i+1, scalar.body]
3735 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3736 // s2 = a[i]
3737 // b[i] = s2 - s1
3738 // br cond, scalar.body, ...
3739 //
3740 // In this example, s1 is a recurrence because its value depends on the
3741 // previous iteration. In the first phase of vectorization, we created a
3742 // temporary value for s1. We now complete the vectorization and produce the
3743 // shorthand vector IR shown below (for VF = 4, UF = 1).
3744 //
3745 // vector.ph:
3746 // v_init = vector(..., ..., ..., a[-1])
3747 // br vector.body
3748 //
3749 // vector.body
3750 // i = phi [0, vector.ph], [i+4, vector.body]
3751 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3752 // v2 = a[i, i+1, i+2, i+3];
3753 // v3 = vector(v1(3), v2(0, 1, 2))
3754 // b[i, i+1, i+2, i+3] = v2 - v3
3755 // br cond, vector.body, middle.block
3756 //
3757 // middle.block:
3758 // x = v2(3)
3759 // br scalar.ph
3760 //
3761 // scalar.ph:
3762 // s_init = phi [x, middle.block], [a[-1], otherwise]
3763 // br scalar.body
3764 //
3765 // After the vector loop completes execution, we extract the next value of
3766 // the recurrence (x) to use as the initial value in the scalar loop.
3767 
3768 // Get the original loop preheader and single loop latch.
3769 auto *Preheader = OrigLoop->getLoopPreheader();
3770 auto *Latch = OrigLoop->getLoopLatch();
3771 
3772 // Get the initial and previous values of the scalar recurrence.
3773 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3774 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3775 
3776 // Create a vector from the initial value.
3777 auto *VectorInit = ScalarInit;
3778 if (VF.isVector()) {
3779 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3780 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
3781 VectorInit = Builder.CreateInsertElement(
3782 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3783 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
3784 }
3785 
3786 // We constructed a temporary phi node in the first phase of vectorization.
3787 // This phi node will eventually be deleted.
3788 Builder.SetInsertPoint(
3789 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3790 
3791 // Create a phi node for the new recurrence. The current value will either be
3792 // the initial value inserted into a vector or the loop-varying vector value.
3793 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3794 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3795 
3796 // Get the vectorized previous value of the last part UF - 1. It appears last
3797 // among all unrolled iterations, due to the order of their construction.
3798 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3799 3800 // Find and set the insertion point after the previous value if it is an 3801 // instruction. 3802 BasicBlock::iterator InsertPt; 3803 // Note that the previous value may have been constant-folded so it is not 3804 // guaranteed to be an instruction in the vector loop. 3805 // FIXME: Loop invariant values do not form recurrences. We should deal with 3806 // them earlier. 3807 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3808 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3809 else { 3810 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3811 if (isa<PHINode>(PreviousLastPart)) 3812 // If the previous value is a phi node, we should insert after all the phi 3813 // nodes in the block containing the PHI to avoid breaking basic block 3814 // verification. Note that the basic block may be different to 3815 // LoopVectorBody, in case we predicate the loop. 3816 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3817 else 3818 InsertPt = ++PreviousInst->getIterator(); 3819 } 3820 Builder.SetInsertPoint(&*InsertPt); 3821 3822 // We will construct a vector for the recurrence by combining the values for 3823 // the current and previous iterations. This is the required shuffle mask. 3824 assert(!VF.isScalable()); 3825 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 3826 ShuffleMask[0] = VF.getKnownMinValue() - 1; 3827 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 3828 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 3829 3830 // The vector from which to take the initial value for the current iteration 3831 // (actual or unrolled). Initially, this is the vector phi node. 3832 Value *Incoming = VecPhi; 3833 3834 // Shuffle the current and previous vector and update the vector parts. 3835 for (unsigned Part = 0; Part < UF; ++Part) { 3836 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3837 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3838 auto *Shuffle = 3839 VF.isVector() 3840 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 3841 : Incoming; 3842 PhiPart->replaceAllUsesWith(Shuffle); 3843 cast<Instruction>(PhiPart)->eraseFromParent(); 3844 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3845 Incoming = PreviousPart; 3846 } 3847 3848 // Fix the latch value of the new recurrence in the vector loop. 3849 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3850 3851 // Extract the last vector element in the middle block. This will be the 3852 // initial value for the recurrence when jumping to the scalar loop. 3853 auto *ExtractForScalar = Incoming; 3854 if (VF.isVector()) { 3855 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3856 ExtractForScalar = Builder.CreateExtractElement( 3857 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 3858 "vector.recur.extract"); 3859 } 3860 // Extract the second last element in the middle block if the 3861 // Phi is used outside the loop. We need to extract the phi itself 3862 // and not the last element (the phi update in the current iteration). This 3863 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3864 // when the scalar loop is not run at all. 
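  // For VF = 4 that is lane VF - 2 == 2 of 'Incoming' (the final unrolled
  // part): the value the recurrence phi held during the last iteration that
  // the vector loop executed.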
3865 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3866 if (VF.isVector())
3867 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3868 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
3869 "vector.recur.extract.for.phi");
3870 // When the loop is unrolled without vectorizing, initialize
3871 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value
3872 // of `Incoming`. This is analogous to the vectorized case above: extracting
3873 // the second last element when VF > 1.
3874 else if (UF > 1)
3875 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3876 
3877 // Fix the initial value of the original recurrence in the scalar loop.
3878 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3879 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3880 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3881 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3882 Start->addIncoming(Incoming, BB);
3883 }
3884 
3885 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3886 Phi->setName("scalar.recur");
3887 
3888 // Finally, fix users of the recurrence outside the loop. The users will need
3889 // either the last value of the scalar recurrence or the last value of the
3890 // vector recurrence we extracted in the middle block. Since the loop is in
3891 // LCSSA form, we just need to find all the phi nodes for the original scalar
3892 // recurrence in the exit block, and then add an edge for the middle block.
3893 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3894 if (LCSSAPhi.getIncomingValue(0) == Phi) {
3895 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3896 }
3897 }
3898 }
3899 
3900 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3901 Constant *Zero = Builder.getInt32(0);
3902 
3903 // Get its reduction variable descriptor.
3904 assert(Legal->isReductionVariable(Phi) &&
3905 "Unable to find the reduction variable");
3906 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3907 
3908 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3909 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3910 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3911 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3912 RdxDesc.getMinMaxRecurrenceKind();
3913 setDebugLocFromInst(Builder, ReductionStartValue);
3914 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3915 
3916 // We need to generate a reduction vector from the incoming scalar.
3917 // To do so, we need to generate the 'identity' vector and override
3918 // one of the elements with the incoming scalar reduction. We need
3919 // to do it in the vector-loop preheader.
3920 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3921 
3922 // This is the vector-clone of the value that leaves the loop.
3923 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3924 
3925 // Find the reduction identity variable: zero for addition, or and xor;
3926 // one for multiplication; -1 for and.
3927 Value *Identity;
3928 Value *VectorStart;
3929 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3930 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3931 // MinMax reductions have the start value as their identity.
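  // Splatting the start value into every lane is safe because min/max are
  // idempotent: folding in extra copies of the start value cannot change the
  // result, unlike e.g. an add reduction where a non-zero start value would
  // be counted once per lane.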
3932 if (VF == 1 || IsInLoopReductionPhi) { 3933 VectorStart = Identity = ReductionStartValue; 3934 } else { 3935 VectorStart = Identity = 3936 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3937 } 3938 } else { 3939 // Handle other reduction kinds: 3940 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3941 RK, VecTy->getScalarType()); 3942 if (VF == 1 || IsInLoopReductionPhi) { 3943 Identity = Iden; 3944 // This vector is the Identity vector where the first element is the 3945 // incoming scalar reduction. 3946 VectorStart = ReductionStartValue; 3947 } else { 3948 Identity = ConstantVector::getSplat(VF, Iden); 3949 3950 // This vector is the Identity vector where the first element is the 3951 // incoming scalar reduction. 3952 VectorStart = 3953 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3954 } 3955 } 3956 3957 // Wrap flags are in general invalid after vectorization, clear them. 3958 clearReductionWrapFlags(RdxDesc); 3959 3960 // Fix the vector-loop phi. 3961 3962 // Reductions do not have to start at zero. They can start with 3963 // any loop invariant values. 3964 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3965 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3966 3967 for (unsigned Part = 0; Part < UF; ++Part) { 3968 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3969 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3970 // Make sure to add the reduction start value only to the 3971 // first unroll part. 3972 Value *StartVal = (Part == 0) ? VectorStart : Identity; 3973 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3974 cast<PHINode>(VecRdxPhi) 3975 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3976 } 3977 3978 // Before each round, move the insertion point right between 3979 // the PHIs and the values we are going to write. 3980 // This allows us to write both PHINodes and the extractelement 3981 // instructions. 3982 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3983 3984 setDebugLocFromInst(Builder, LoopExitInst); 3985 3986 // If tail is folded by masking, the vector value to leave the loop should be 3987 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3988 // instead of the former. 3989 if (Cost->foldTailByMasking()) { 3990 for (unsigned Part = 0; Part < UF; ++Part) { 3991 Value *VecLoopExitInst = 3992 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3993 Value *Sel = nullptr; 3994 for (User *U : VecLoopExitInst->users()) { 3995 if (isa<SelectInst>(U)) { 3996 assert(!Sel && "Reduction exit feeding two selects"); 3997 Sel = U; 3998 } else 3999 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4000 } 4001 assert(Sel && "Reduction exit feeds no select"); 4002 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4003 4004 // If the target can create a predicated operator for the reduction at no 4005 // extra cost in the loop (for example a predicated vadd), it can be 4006 // cheaper for the select to remain in the loop than be sunk out of it, 4007 // and so use the select value for the phi instead of the old 4008 // LoopExitValue. 
4009 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4010 if (PreferPredicatedReductionSelect || 4011 TTI->preferPredicatedReductionSelect( 4012 RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()), 4013 Phi->getType(), TargetTransformInfo::ReductionFlags())) { 4014 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4015 VecRdxPhi->setIncomingValueForBlock( 4016 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4017 } 4018 } 4019 } 4020 4021 // If the vector reduction can be performed in a smaller type, we truncate 4022 // then extend the loop exit value to enable InstCombine to evaluate the 4023 // entire expression in the smaller type. 4024 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4025 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4026 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4027 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4028 Builder.SetInsertPoint( 4029 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4030 VectorParts RdxParts(UF); 4031 for (unsigned Part = 0; Part < UF; ++Part) { 4032 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4033 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4034 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4035 : Builder.CreateZExt(Trunc, VecTy); 4036 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4037 UI != RdxParts[Part]->user_end();) 4038 if (*UI != Trunc) { 4039 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4040 RdxParts[Part] = Extnd; 4041 } else { 4042 ++UI; 4043 } 4044 } 4045 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4046 for (unsigned Part = 0; Part < UF; ++Part) { 4047 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4048 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4049 } 4050 } 4051 4052 // Reduce all of the unrolled parts into a single vector. 4053 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4054 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 4055 4056 // The middle block terminator has already been assigned a DebugLoc here (the 4057 // OrigLoop's single latch terminator). We want the whole middle block to 4058 // appear to execute on this line because: (a) it is all compiler generated, 4059 // (b) these instructions are always executed after evaluating the latch 4060 // conditional branch, and (c) other passes may add new predecessors which 4061 // terminate on this line. This is the easiest way to ensure we don't 4062 // accidentally cause an extra step back into the loop while debugging. 4063 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4064 for (unsigned Part = 1; Part < UF; ++Part) { 4065 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4066 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4067 // Floating point operations had to be 'fast' to enable the reduction. 4068 ReducedPartRdx = addFastMathFlag( 4069 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4070 ReducedPartRdx, "bin.rdx"), 4071 RdxDesc.getFastMathFlags()); 4072 else 4073 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 4074 RdxPart); 4075 } 4076 4077 // Create the reduction after the loop. Note that inloop reductions create the 4078 // target reduction in the loop using a Reduction recipe. 
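  // For these out-of-loop reductions this emits a single horizontal reduction
  // of ReducedPartRdx (e.g. summing all vector lanes for an add reduction),
  // typically via a target reduction intrinsic chosen by createTargetReduction.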
4079 if (VF.isVector() && !IsInLoopReductionPhi) { 4080 bool NoNaN = Legal->hasFunNoNaNAttr(); 4081 ReducedPartRdx = 4082 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 4083 // If the reduction can be performed in a smaller type, we need to extend 4084 // the reduction to the wider type before we branch to the original loop. 4085 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4086 ReducedPartRdx = 4087 RdxDesc.isSigned() 4088 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4089 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4090 } 4091 4092 // Create a phi node that merges control-flow from the backedge-taken check 4093 // block and the middle block. 4094 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4095 LoopScalarPreHeader->getTerminator()); 4096 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4097 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4098 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4099 4100 // Now, we need to fix the users of the reduction variable 4101 // inside and outside of the scalar remainder loop. 4102 // We know that the loop is in LCSSA form. We need to update the 4103 // PHI nodes in the exit blocks. 4104 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4105 // All PHINodes need to have a single entry edge, or two if 4106 // we already fixed them. 4107 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4108 4109 // We found a reduction value exit-PHI. Update it with the 4110 // incoming bypass edge. 4111 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4112 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4113 } // end of the LCSSA phi scan. 4114 4115 // Fix the scalar loop reduction variable with the incoming reduction sum 4116 // from the vector body and from the backedge value. 4117 int IncomingEdgeBlockIdx = 4118 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4119 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4120 // Pick the other block. 4121 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4122 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4123 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4124 } 4125 4126 void InnerLoopVectorizer::clearReductionWrapFlags( 4127 RecurrenceDescriptor &RdxDesc) { 4128 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 4129 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 4130 RK != RecurrenceDescriptor::RK_IntegerMult) 4131 return; 4132 4133 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4134 assert(LoopExitInstr && "null loop exit instruction"); 4135 SmallVector<Instruction *, 8> Worklist; 4136 SmallPtrSet<Instruction *, 8> Visited; 4137 Worklist.push_back(LoopExitInstr); 4138 Visited.insert(LoopExitInstr); 4139 4140 while (!Worklist.empty()) { 4141 Instruction *Cur = Worklist.pop_back_val(); 4142 if (isa<OverflowingBinaryOperator>(Cur)) 4143 for (unsigned Part = 0; Part < UF; ++Part) { 4144 Value *V = getOrCreateVectorValue(Cur, Part); 4145 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4146 } 4147 4148 for (User *U : Cur->users()) { 4149 Instruction *UI = cast<Instruction>(U); 4150 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4151 Visited.insert(UI).second) 4152 Worklist.push_back(UI); 4153 } 4154 } 4155 } 4156 4157 void InnerLoopVectorizer::fixLCSSAPHIs() { 4158 assert(!VF.isScalable() && "the code below assumes fixed width vectors"); 4159 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4160 if (LCSSAPhi.getNumIncomingValues() == 1) { 4161 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4162 // Non-instruction incoming values will have only one value. 4163 unsigned LastLane = 0; 4164 if (isa<Instruction>(IncomingValue)) 4165 LastLane = Cost->isUniformAfterVectorization( 4166 cast<Instruction>(IncomingValue), VF) 4167 ? 0 4168 : VF.getKnownMinValue() - 1; 4169 // Can be a loop invariant incoming value or the last scalar value to be 4170 // extracted from the vectorized loop. 4171 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4172 Value *lastIncomingValue = 4173 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4174 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4175 } 4176 } 4177 } 4178 4179 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4180 // The basic block and loop containing the predicated instruction. 4181 auto *PredBB = PredInst->getParent(); 4182 auto *VectorLoop = LI->getLoopFor(PredBB); 4183 4184 // Initialize a worklist with the operands of the predicated instruction. 4185 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4186 4187 // Holds instructions that we need to analyze again. An instruction may be 4188 // reanalyzed if we don't yet know if we can sink it or not. 4189 SmallVector<Instruction *, 8> InstsToReanalyze; 4190 4191 // Returns true if a given use occurs in the predicated block. Phi nodes use 4192 // their operands in their corresponding predecessor blocks. 4193 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4194 auto *I = cast<Instruction>(U.getUser()); 4195 BasicBlock *BB = I->getParent(); 4196 if (auto *Phi = dyn_cast<PHINode>(I)) 4197 BB = Phi->getIncomingBlock( 4198 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4199 return BB == PredBB; 4200 }; 4201 4202 // Iteratively sink the scalarized operands of the predicated instruction 4203 // into the block we created for it. When an instruction is sunk, it's 4204 // operands are then added to the worklist. 
The algorithm ends after one pass 4205 // through the worklist doesn't sink a single instruction. 4206 bool Changed; 4207 do { 4208 // Add the instructions that need to be reanalyzed to the worklist, and 4209 // reset the changed indicator. 4210 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4211 InstsToReanalyze.clear(); 4212 Changed = false; 4213 4214 while (!Worklist.empty()) { 4215 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4216 4217 // We can't sink an instruction if it is a phi node, is already in the 4218 // predicated block, is not in the loop, or may have side effects. 4219 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4220 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4221 continue; 4222 4223 // It's legal to sink the instruction if all its uses occur in the 4224 // predicated block. Otherwise, there's nothing to do yet, and we may 4225 // need to reanalyze the instruction. 4226 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4227 InstsToReanalyze.push_back(I); 4228 continue; 4229 } 4230 4231 // Move the instruction to the beginning of the predicated block, and add 4232 // it's operands to the worklist. 4233 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4234 Worklist.insert(I->op_begin(), I->op_end()); 4235 4236 // The sinking may have enabled other instructions to be sunk, so we will 4237 // need to iterate. 4238 Changed = true; 4239 } 4240 } while (Changed); 4241 } 4242 4243 void InnerLoopVectorizer::fixNonInductionPHIs() { 4244 for (PHINode *OrigPhi : OrigPHIsToFix) { 4245 PHINode *NewPhi = 4246 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4247 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4248 4249 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4250 predecessors(OrigPhi->getParent())); 4251 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4252 predecessors(NewPhi->getParent())); 4253 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4254 "Scalar and Vector BB should have the same number of predecessors"); 4255 4256 // The insertion point in Builder may be invalidated by the time we get 4257 // here. Force the Builder insertion point to something valid so that we do 4258 // not run into issues during insertion point restore in 4259 // getOrCreateVectorValue calls below. 4260 Builder.SetInsertPoint(NewPhi); 4261 4262 // The predecessor order is preserved and we can rely on mapping between 4263 // scalar and vector block predecessors. 4264 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4265 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4266 4267 // When looking up the new scalar/vector values to fix up, use incoming 4268 // values from original phi. 4269 Value *ScIncV = 4270 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4271 4272 // Scalar incoming value may need a broadcast 4273 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4274 NewPhi->addIncoming(NewIncV, NewPredBB); 4275 } 4276 } 4277 } 4278 4279 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, 4280 unsigned UF, ElementCount VF, 4281 bool IsPtrLoopInvariant, 4282 SmallBitVector &IsIndexLoopInvariant, 4283 VPTransformState &State) { 4284 // Construct a vector GEP by widening the operands of the scalar GEP as 4285 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4286 // results in a vector of pointers when at least one operand of the GEP 4287 // is vector-typed. 
Thus, to keep the representation compact, we only use 4288 // vector-typed operands for loop-varying values. 4289 4290 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4291 // If we are vectorizing, but the GEP has only loop-invariant operands, 4292 // the GEP we build (by only using vector-typed operands for 4293 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4294 // produce a vector of pointers, we need to either arbitrarily pick an 4295 // operand to broadcast, or broadcast a clone of the original GEP. 4296 // Here, we broadcast a clone of the original. 4297 // 4298 // TODO: If at some point we decide to scalarize instructions having 4299 // loop-invariant operands, this special case will no longer be 4300 // required. We would add the scalarization decision to 4301 // collectLoopScalars() and teach getVectorValue() to broadcast 4302 // the lane-zero scalar value. 4303 auto *Clone = Builder.Insert(GEP->clone()); 4304 for (unsigned Part = 0; Part < UF; ++Part) { 4305 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4306 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4307 addMetadata(EntryPart, GEP); 4308 } 4309 } else { 4310 // If the GEP has at least one loop-varying operand, we are sure to 4311 // produce a vector of pointers. But if we are only unrolling, we want 4312 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4313 // produce with the code below will be scalar (if VF == 1) or vector 4314 // (otherwise). Note that for the unroll-only case, we still maintain 4315 // values in the vector mapping with initVector, as we do for other 4316 // instructions. 4317 for (unsigned Part = 0; Part < UF; ++Part) { 4318 // The pointer operand of the new GEP. If it's loop-invariant, we 4319 // won't broadcast it. 4320 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4321 : State.get(Operands.getOperand(0), Part); 4322 4323 // Collect all the indices for the new GEP. If any index is 4324 // loop-invariant, we won't broadcast it. 4325 SmallVector<Value *, 4> Indices; 4326 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4327 VPValue *Operand = Operands.getOperand(I); 4328 if (IsIndexLoopInvariant[I - 1]) 4329 Indices.push_back(State.get(Operand, {0, 0})); 4330 else 4331 Indices.push_back(State.get(Operand, Part)); 4332 } 4333 4334 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4335 // but it should be a vector, otherwise. 4336 auto *NewGEP = 4337 GEP->isInBounds() 4338 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4339 Indices) 4340 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4341 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4342 "NewGEP is not a pointer vector"); 4343 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4344 addMetadata(NewGEP, GEP); 4345 } 4346 } 4347 } 4348 4349 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4350 ElementCount VF) { 4351 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4352 PHINode *P = cast<PHINode>(PN); 4353 if (EnableVPlanNativePath) { 4354 // Currently we enter here in the VPlan-native path for non-induction 4355 // PHIs where all control flow is uniform. We simply widen these PHIs. 4356 // Create a vector phi with no operands - the vector phi operands will be 4357 // set at the end of vector code generation. 4358 Type *VecTy = 4359 (VF.isScalar()) ? 
PN->getType() : VectorType::get(PN->getType(), VF); 4360 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4361 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4362 OrigPHIsToFix.push_back(P); 4363 4364 return; 4365 } 4366 4367 assert(PN->getParent() == OrigLoop->getHeader() && 4368 "Non-header phis should have been handled elsewhere"); 4369 4370 // In order to support recurrences we need to be able to vectorize Phi nodes. 4371 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4372 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4373 // this value when we vectorize all of the instructions that use the PHI. 4374 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4375 for (unsigned Part = 0; Part < UF; ++Part) { 4376 // This is phase one of vectorizing PHIs. 4377 bool ScalarPHI = 4378 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4379 Type *VecTy = 4380 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4381 Value *EntryPart = PHINode::Create( 4382 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4383 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4384 } 4385 return; 4386 } 4387 4388 setDebugLocFromInst(Builder, P); 4389 4390 // This PHINode must be an induction variable. 4391 // Make sure that we know about it. 4392 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4393 4394 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4395 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4396 4397 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4398 // which can be found from the original scalar operations. 4399 switch (II.getKind()) { 4400 case InductionDescriptor::IK_NoInduction: 4401 llvm_unreachable("Unknown induction"); 4402 case InductionDescriptor::IK_IntInduction: 4403 case InductionDescriptor::IK_FpInduction: 4404 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4405 case InductionDescriptor::IK_PtrInduction: { 4406 // Handle the pointer induction variable case. 4407 assert(P->getType()->isPointerTy() && "Unexpected type."); 4408 4409 if (Cost->isScalarAfterVectorization(P, VF)) { 4410 // This is the normalized GEP that starts counting at zero. 4411 Value *PtrInd = 4412 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4413 // Determine the number of scalars we need to generate for each unroll 4414 // iteration. If the instruction is uniform, we only need to generate the 4415 // first lane. Otherwise, we generate all VF values. 4416 unsigned Lanes = 4417 Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.getKnownMinValue(); 4418 for (unsigned Part = 0; Part < UF; ++Part) { 4419 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4420 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4421 Lane + Part * VF.getKnownMinValue()); 4422 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4423 Value *SclrGep = 4424 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4425 SclrGep->setName("next.gep"); 4426 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4427 } 4428 } 4429 return; 4430 } 4431 assert(isa<SCEVConstant>(II.getStep()) && 4432 "Induction step not a SCEV constant!"); 4433 Type *PhiType = II.getStep()->getType(); 4434 4435 // Build a pointer phi 4436 Value *ScalarStartValue = II.getStartValue(); 4437 Type *ScStValueType = ScalarStartValue->getType(); 4438 PHINode *NewPointerPhi = 4439 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4440 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4441 4442 // A pointer induction, performed by using a gep 4443 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4444 Instruction *InductionLoc = LoopLatch->getTerminator(); 4445 const SCEV *ScalarStep = II.getStep(); 4446 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4447 Value *ScalarStepValue = 4448 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4449 Value *InductionGEP = GetElementPtrInst::Create( 4450 ScStValueType->getPointerElementType(), NewPointerPhi, 4451 Builder.CreateMul( 4452 ScalarStepValue, 4453 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4454 "ptr.ind", InductionLoc); 4455 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4456 4457 // Create UF many actual address geps that use the pointer 4458 // phi as base and a vectorized version of the step value 4459 // (<step*0, ..., step*N>) as offset. 4460 for (unsigned Part = 0; Part < UF; ++Part) { 4461 SmallVector<Constant *, 8> Indices; 4462 // Create a vector of consecutive numbers from zero to VF. 4463 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4464 Indices.push_back( 4465 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4466 Constant *StartOffset = ConstantVector::get(Indices); 4467 4468 Value *GEP = Builder.CreateGEP( 4469 ScStValueType->getPointerElementType(), NewPointerPhi, 4470 Builder.CreateMul( 4471 StartOffset, 4472 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4473 "vector.gep")); 4474 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4475 } 4476 } 4477 } 4478 } 4479 4480 /// A helper function for checking whether an integer division-related 4481 /// instruction may divide by zero (in which case it must be predicated if 4482 /// executed conditionally in the scalar code). 4483 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4484 /// Non-zero divisors that are non compile-time constants will not be 4485 /// converted into multiplication, so we will still end up scalarizing 4486 /// the division, but can do so w/o predication. 
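/// For example, in scalar source such as
///   if (c) x = a / b;   // 'b' is not a compile-time constant
/// the division may trap if executed unconditionally, so when scalarized it
/// must stay predicated; with a known non-zero constant divisor (e.g. a / 7)
/// it can be scalarized without predication.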
4487 static bool mayDivideByZero(Instruction &I) { 4488 assert((I.getOpcode() == Instruction::UDiv || 4489 I.getOpcode() == Instruction::SDiv || 4490 I.getOpcode() == Instruction::URem || 4491 I.getOpcode() == Instruction::SRem) && 4492 "Unexpected instruction"); 4493 Value *Divisor = I.getOperand(1); 4494 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4495 return !CInt || CInt->isZero(); 4496 } 4497 4498 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, 4499 VPTransformState &State) { 4500 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4501 switch (I.getOpcode()) { 4502 case Instruction::Call: 4503 case Instruction::Br: 4504 case Instruction::PHI: 4505 case Instruction::GetElementPtr: 4506 case Instruction::Select: 4507 llvm_unreachable("This instruction is handled by a different recipe."); 4508 case Instruction::UDiv: 4509 case Instruction::SDiv: 4510 case Instruction::SRem: 4511 case Instruction::URem: 4512 case Instruction::Add: 4513 case Instruction::FAdd: 4514 case Instruction::Sub: 4515 case Instruction::FSub: 4516 case Instruction::FNeg: 4517 case Instruction::Mul: 4518 case Instruction::FMul: 4519 case Instruction::FDiv: 4520 case Instruction::FRem: 4521 case Instruction::Shl: 4522 case Instruction::LShr: 4523 case Instruction::AShr: 4524 case Instruction::And: 4525 case Instruction::Or: 4526 case Instruction::Xor: { 4527 // Just widen unops and binops. 4528 setDebugLocFromInst(Builder, &I); 4529 4530 for (unsigned Part = 0; Part < UF; ++Part) { 4531 SmallVector<Value *, 2> Ops; 4532 for (VPValue *VPOp : User.operands()) 4533 Ops.push_back(State.get(VPOp, Part)); 4534 4535 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4536 4537 if (auto *VecOp = dyn_cast<Instruction>(V)) 4538 VecOp->copyIRFlags(&I); 4539 4540 // Use this vector value for all users of the original instruction. 4541 VectorLoopValueMap.setVectorValue(&I, Part, V); 4542 addMetadata(V, &I); 4543 } 4544 4545 break; 4546 } 4547 case Instruction::ICmp: 4548 case Instruction::FCmp: { 4549 // Widen compares. Generate vector compares. 4550 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4551 auto *Cmp = cast<CmpInst>(&I); 4552 setDebugLocFromInst(Builder, Cmp); 4553 for (unsigned Part = 0; Part < UF; ++Part) { 4554 Value *A = State.get(User.getOperand(0), Part); 4555 Value *B = State.get(User.getOperand(1), Part); 4556 Value *C = nullptr; 4557 if (FCmp) { 4558 // Propagate fast math flags. 4559 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4560 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4561 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4562 } else { 4563 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4564 } 4565 VectorLoopValueMap.setVectorValue(&I, Part, C); 4566 addMetadata(C, &I); 4567 } 4568 4569 break; 4570 } 4571 4572 case Instruction::ZExt: 4573 case Instruction::SExt: 4574 case Instruction::FPToUI: 4575 case Instruction::FPToSI: 4576 case Instruction::FPExt: 4577 case Instruction::PtrToInt: 4578 case Instruction::IntToPtr: 4579 case Instruction::SIToFP: 4580 case Instruction::UIToFP: 4581 case Instruction::Trunc: 4582 case Instruction::FPTrunc: 4583 case Instruction::BitCast: { 4584 auto *CI = cast<CastInst>(&I); 4585 setDebugLocFromInst(Builder, CI); 4586 4587 /// Vectorize casts. 4588 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4589 Type *DestTy = 4590 (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4591 4592 for (unsigned Part = 0; Part < UF; ++Part) { 4593 Value *A = State.get(User.getOperand(0), Part); 4594 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4595 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4596 addMetadata(Cast, &I); 4597 } 4598 break; 4599 } 4600 default: 4601 // This instruction is not vectorized by simple widening. 4602 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4603 llvm_unreachable("Unhandled instruction!"); 4604 } // end of switch. 4605 } 4606 4607 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, 4608 VPTransformState &State) { 4609 assert(!isa<DbgInfoIntrinsic>(I) && 4610 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4611 setDebugLocFromInst(Builder, &I); 4612 4613 Module *M = I.getParent()->getParent()->getParent(); 4614 auto *CI = cast<CallInst>(&I); 4615 4616 SmallVector<Type *, 4> Tys; 4617 for (Value *ArgOperand : CI->arg_operands()) 4618 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4619 4620 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4621 4622 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4623 // version of the instruction. 4624 // Is it beneficial to perform intrinsic call compared to lib call? 4625 bool NeedToScalarize = false; 4626 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4627 bool UseVectorIntrinsic = 4628 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4629 assert((UseVectorIntrinsic || !NeedToScalarize) && 4630 "Instruction should be scalarized elsewhere."); 4631 4632 for (unsigned Part = 0; Part < UF; ++Part) { 4633 SmallVector<Value *, 4> Args; 4634 for (auto &I : enumerate(ArgOperands.operands())) { 4635 // Some intrinsics have a scalar argument - don't replace it with a 4636 // vector. 4637 Value *Arg; 4638 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4639 Arg = State.get(I.value(), Part); 4640 else 4641 Arg = State.get(I.value(), {0, 0}); 4642 Args.push_back(Arg); 4643 } 4644 4645 Function *VectorF; 4646 if (UseVectorIntrinsic) { 4647 // Use vector version of the intrinsic. 4648 Type *TysForDecl[] = {CI->getType()}; 4649 if (VF.isVector()) { 4650 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4651 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4652 } 4653 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4654 assert(VectorF && "Can't retrieve vector intrinsic."); 4655 } else { 4656 // Use vector version of the function call. 4657 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4658 #ifndef NDEBUG 4659 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4660 "Can't create vector function."); 4661 #endif 4662 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4663 } 4664 SmallVector<OperandBundleDef, 1> OpBundles; 4665 CI->getOperandBundlesAsDefs(OpBundles); 4666 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4667 4668 if (isa<FPMathOperator>(V)) 4669 V->copyFastMathFlags(CI); 4670 4671 VectorLoopValueMap.setVectorValue(&I, Part, V); 4672 addMetadata(V, &I); 4673 } 4674 } 4675 4676 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, 4677 VPUser &Operands, 4678 bool InvariantCond, 4679 VPTransformState &State) { 4680 setDebugLocFromInst(Builder, &I); 4681 4682 // The condition can be loop invariant but still defined inside the 4683 // loop. 
This means that we can't just use the original 'cond' value. 4684 // We have to take the 'vectorized' value and pick the first lane. 4685 // Instcombine will make this a no-op. 4686 auto *InvarCond = 4687 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4688 4689 for (unsigned Part = 0; Part < UF; ++Part) { 4690 Value *Cond = 4691 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4692 Value *Op0 = State.get(Operands.getOperand(1), Part); 4693 Value *Op1 = State.get(Operands.getOperand(2), Part); 4694 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4695 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4696 addMetadata(Sel, &I); 4697 } 4698 } 4699 4700 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4701 // We should not collect Scalars more than once per VF. Right now, this 4702 // function is called from collectUniformsAndScalars(), which already does 4703 // this check. Collecting Scalars for VF=1 does not make any sense. 4704 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4705 "This function should not be visited twice for the same VF"); 4706 4707 SmallSetVector<Instruction *, 8> Worklist; 4708 4709 // These sets are used to seed the analysis with pointers used by memory 4710 // accesses that will remain scalar. 4711 SmallSetVector<Instruction *, 8> ScalarPtrs; 4712 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4713 auto *Latch = TheLoop->getLoopLatch(); 4714 4715 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4716 // The pointer operands of loads and stores will be scalar as long as the 4717 // memory access is not a gather or scatter operation. The value operand of a 4718 // store will remain scalar if the store is scalarized. 4719 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4720 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4721 assert(WideningDecision != CM_Unknown && 4722 "Widening decision should be ready at this moment"); 4723 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4724 if (Ptr == Store->getValueOperand()) 4725 return WideningDecision == CM_Scalarize; 4726 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4727 "Ptr is neither a value or pointer operand"); 4728 return WideningDecision != CM_GatherScatter; 4729 }; 4730 4731 // A helper that returns true if the given value is a bitcast or 4732 // getelementptr instruction contained in the loop. 4733 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4734 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4735 isa<GetElementPtrInst>(V)) && 4736 !TheLoop->isLoopInvariant(V); 4737 }; 4738 4739 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4740 if (!isa<PHINode>(Ptr) || 4741 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4742 return false; 4743 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4744 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4745 return false; 4746 return isScalarUse(MemAccess, Ptr); 4747 }; 4748 4749 // A helper that evaluates a memory access's use of a pointer. If the 4750 // pointer is actually the pointer induction of a loop, it is being 4751 // inserted into Worklist. If the use will be a scalar use, and the 4752 // pointer is only used by memory accesses, we place the pointer in 4753 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
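  // Informally, for a loop like
  //   for (i = 0; i < n; ++i) sum += A[i];
  // the getelementptr computing &A[i] is a scalar use of a consecutive load,
  // and if memory accesses are its only users it goes into ScalarPtrs; a
  // pointer that also feeds non-memory users (or a gather/scatter) would be
  // recorded in PossibleNonScalarPtrs instead.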
4754 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4755 if (isScalarPtrInduction(MemAccess, Ptr)) { 4756 Worklist.insert(cast<Instruction>(Ptr)); 4757 Instruction *Update = cast<Instruction>( 4758 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 4759 Worklist.insert(Update); 4760 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 4761 << "\n"); 4762 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 4763 << "\n"); 4764 return; 4765 } 4766 // We only care about bitcast and getelementptr instructions contained in 4767 // the loop. 4768 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4769 return; 4770 4771 // If the pointer has already been identified as scalar (e.g., if it was 4772 // also identified as uniform), there's nothing to do. 4773 auto *I = cast<Instruction>(Ptr); 4774 if (Worklist.count(I)) 4775 return; 4776 4777 // If the use of the pointer will be a scalar use, and all users of the 4778 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4779 // place the pointer in PossibleNonScalarPtrs. 4780 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4781 return isa<LoadInst>(U) || isa<StoreInst>(U); 4782 })) 4783 ScalarPtrs.insert(I); 4784 else 4785 PossibleNonScalarPtrs.insert(I); 4786 }; 4787 4788 // We seed the scalars analysis with two classes of instructions: (1) 4789 // instructions marked uniform-after-vectorization and (2) bitcast, 4790 // getelementptr and (pointer) phi instructions used by memory accesses 4791 // requiring a scalar use. 4792 // 4793 // (1) Add to the worklist all instructions that have been identified as 4794 // uniform-after-vectorization. 4795 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4796 4797 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4798 // memory accesses requiring a scalar use. The pointer operands of loads and 4799 // stores will be scalar as long as the memory access is not a gather or 4800 // scatter operation. The value operand of a store will remain scalar if the 4801 // store is scalarized. 4802 for (auto *BB : TheLoop->blocks()) 4803 for (auto &I : *BB) { 4804 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4805 evaluatePtrUse(Load, Load->getPointerOperand()); 4806 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4807 evaluatePtrUse(Store, Store->getPointerOperand()); 4808 evaluatePtrUse(Store, Store->getValueOperand()); 4809 } 4810 } 4811 for (auto *I : ScalarPtrs) 4812 if (!PossibleNonScalarPtrs.count(I)) { 4813 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4814 Worklist.insert(I); 4815 } 4816 4817 // Insert the forced scalars. 4818 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4819 // induction variable when the PHI user is scalarized. 4820 auto ForcedScalar = ForcedScalars.find(VF); 4821 if (ForcedScalar != ForcedScalars.end()) 4822 for (auto *I : ForcedScalar->second) 4823 Worklist.insert(I); 4824 4825 // Expand the worklist by looking through any bitcasts and getelementptr 4826 // instructions we've already identified as scalar. This is similar to the 4827 // expansion step in collectLoopUniforms(); however, here we're only 4828 // expanding to include additional bitcasts and getelementptr instructions.
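  // E.g. (sketch), given a chain such as
  //   %gep  = getelementptr inbounds i32, i32* %base, i64 %iv
  //   %cast = bitcast i32* %gep to float*
  //   %val  = load float, float* %cast
  // when %cast is visited below and all of %gep's users (here just %cast) are
  // already known to be scalar, %gep is added to the worklist as well.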
4829 unsigned Idx = 0; 4830 while (Idx != Worklist.size()) { 4831 Instruction *Dst = Worklist[Idx++]; 4832 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4833 continue; 4834 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4835 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4836 auto *J = cast<Instruction>(U); 4837 return !TheLoop->contains(J) || Worklist.count(J) || 4838 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4839 isScalarUse(J, Src)); 4840 })) { 4841 Worklist.insert(Src); 4842 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4843 } 4844 } 4845 4846 // An induction variable will remain scalar if all users of the induction 4847 // variable and induction variable update remain scalar. 4848 for (auto &Induction : Legal->getInductionVars()) { 4849 auto *Ind = Induction.first; 4850 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4851 4852 // If tail-folding is applied, the primary induction variable will be used 4853 // to feed a vector compare. 4854 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4855 continue; 4856 4857 // Determine if all users of the induction variable are scalar after 4858 // vectorization. 4859 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4860 auto *I = cast<Instruction>(U); 4861 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4862 }); 4863 if (!ScalarInd) 4864 continue; 4865 4866 // Determine if all users of the induction variable update instruction are 4867 // scalar after vectorization. 4868 auto ScalarIndUpdate = 4869 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4870 auto *I = cast<Instruction>(U); 4871 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4872 }); 4873 if (!ScalarIndUpdate) 4874 continue; 4875 4876 // The induction variable and its update instruction will remain scalar. 4877 Worklist.insert(Ind); 4878 Worklist.insert(IndUpdate); 4879 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4880 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4881 << "\n"); 4882 } 4883 4884 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4885 } 4886 4887 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 4888 ElementCount VF) { 4889 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4890 if (!blockNeedsPredication(I->getParent())) 4891 return false; 4892 switch(I->getOpcode()) { 4893 default: 4894 break; 4895 case Instruction::Load: 4896 case Instruction::Store: { 4897 if (!Legal->isMaskRequired(I)) 4898 return false; 4899 auto *Ptr = getLoadStorePointerOperand(I); 4900 auto *Ty = getMemInstValueType(I); 4901 // We have already decided how to vectorize this instruction, get that 4902 // result. 4903 if (VF.isVector()) { 4904 InstWidening WideningDecision = getWideningDecision(I, VF); 4905 assert(WideningDecision != CM_Unknown && 4906 "Widening decision should be ready at this moment"); 4907 return WideningDecision == CM_Scalarize; 4908 } 4909 const Align Alignment = getLoadStoreAlignment(I); 4910 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4911 isLegalMaskedGather(Ty, Alignment)) 4912 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4913 isLegalMaskedScatter(Ty, Alignment)); 4914 } 4915 case Instruction::UDiv: 4916 case Instruction::SDiv: 4917 case Instruction::SRem: 4918 case Instruction::URem: 4919 return mayDivideByZero(*I); 4920 } 4921 return false; 4922 } 4923 4924 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4925 Instruction *I, ElementCount VF) { 4926 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4927 assert(getWideningDecision(I, VF) == CM_Unknown && 4928 "Decision should not be set yet."); 4929 auto *Group = getInterleavedAccessGroup(I); 4930 assert(Group && "Must have a group."); 4931 4932 // If the instruction's allocated size doesn't equal it's type size, it 4933 // requires padding and will be scalarized. 4934 auto &DL = I->getModule()->getDataLayout(); 4935 auto *ScalarTy = getMemInstValueType(I); 4936 if (hasIrregularType(ScalarTy, DL, VF)) 4937 return false; 4938 4939 // Check if masking is required. 4940 // A Group may need masking for one of two reasons: it resides in a block that 4941 // needs predication, or it was decided to use masking to deal with gaps. 4942 bool PredicatedAccessRequiresMasking = 4943 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4944 bool AccessWithGapsRequiresMasking = 4945 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4946 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4947 return true; 4948 4949 // If masked interleaving is required, we expect that the user/target had 4950 // enabled it, because otherwise it either wouldn't have been created or 4951 // it should have been invalidated by the CostModel. 4952 assert(useMaskedInterleavedAccesses(TTI) && 4953 "Masked interleave-groups for predicated accesses are not enabled."); 4954 4955 auto *Ty = getMemInstValueType(I); 4956 const Align Alignment = getLoadStoreAlignment(I); 4957 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4958 : TTI.isLegalMaskedStore(Ty, Alignment); 4959 } 4960 4961 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4962 Instruction *I, ElementCount VF) { 4963 // Get and ensure we have a valid memory instruction. 4964 LoadInst *LI = dyn_cast<LoadInst>(I); 4965 StoreInst *SI = dyn_cast<StoreInst>(I); 4966 assert((LI || SI) && "Invalid memory instruction"); 4967 4968 auto *Ptr = getLoadStorePointerOperand(I); 4969 4970 // In order to be widened, the pointer should be consecutive, first of all. 4971 if (!Legal->isConsecutivePtr(Ptr)) 4972 return false; 4973 4974 // If the instruction is a store located in a predicated block, it will be 4975 // scalarized. 4976 if (isScalarWithPredication(I)) 4977 return false; 4978 4979 // If the instruction's allocated size doesn't equal it's type size, it 4980 // requires padding and will be scalarized. 4981 auto &DL = I->getModule()->getDataLayout(); 4982 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4983 if (hasIrregularType(ScalarTy, DL, VF)) 4984 return false; 4985 4986 return true; 4987 } 4988 4989 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4990 // We should not collect Uniforms more than once per VF. Right now, 4991 // this function is called from collectUniformsAndScalars(), which 4992 // already does this check. Collecting Uniforms for VF=1 does not make any 4993 // sense. 
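  // As an informal example, in
  //   for (i = 0; i < n; ++i) A[i] = B[i];
  // the compare feeding the latch branch and the addresses of the widened,
  // consecutive loads/stores produce the same value for every lane of a
  // vector iteration, so one scalar instance per unrolled part suffices;
  // those are the "uniform" instructions collected here.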
4994 4995 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4996 "This function should not be visited twice for the same VF"); 4997 4998 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4999 // not analyze again. Uniforms.count(VF) will return 1. 5000 Uniforms[VF].clear(); 5001 5002 // We now know that the loop is vectorizable! 5003 // Collect instructions inside the loop that will remain uniform after 5004 // vectorization. 5005 5006 // Global values, params and instructions outside of current loop are out of 5007 // scope. 5008 auto isOutOfScope = [&](Value *V) -> bool { 5009 Instruction *I = dyn_cast<Instruction>(V); 5010 return (!I || !TheLoop->contains(I)); 5011 }; 5012 5013 SetVector<Instruction *> Worklist; 5014 BasicBlock *Latch = TheLoop->getLoopLatch(); 5015 5016 // Instructions that are scalar with predication must not be considered 5017 // uniform after vectorization, because that would create an erroneous 5018 // replicating region where only a single instance out of VF should be formed. 5019 // TODO: optimize such seldom cases if found important, see PR40816. 5020 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5021 if (isScalarWithPredication(I, VF)) { 5022 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5023 << *I << "\n"); 5024 return; 5025 } 5026 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5027 Worklist.insert(I); 5028 }; 5029 5030 // Start with the conditional branch. If the branch condition is an 5031 // instruction contained in the loop that is only used by the branch, it is 5032 // uniform. 5033 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5034 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5035 addToWorklistIfAllowed(Cmp); 5036 5037 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 5038 // are pointers that are treated like consecutive pointers during 5039 // vectorization. The pointer operands of interleaved accesses are an 5040 // example. 5041 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 5042 5043 // Holds pointer operands of instructions that are possibly non-uniform. 5044 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 5045 5046 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5047 InstWidening WideningDecision = getWideningDecision(I, VF); 5048 assert(WideningDecision != CM_Unknown && 5049 "Widening decision should be ready at this moment"); 5050 5051 return (WideningDecision == CM_Widen || 5052 WideningDecision == CM_Widen_Reverse || 5053 WideningDecision == CM_Interleave); 5054 }; 5055 // Iterate over the instructions in the loop, and collect all 5056 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 5057 // that a consecutive-like pointer operand will be scalarized, we collect it 5058 // in PossibleNonUniformPtrs instead. We use two sets here because a single 5059 // getelementptr instruction can be used by both vectorized and scalarized 5060 // memory instructions. For example, if a loop loads and stores from the same 5061 // location, but the store is conditional, the store will be scalarized, and 5062 // the getelementptr won't remain uniform. 5063 for (auto *BB : TheLoop->blocks()) 5064 for (auto &I : *BB) { 5065 // If there's no pointer operand, there's nothing to do. 
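      // (When there is one, the classification is roughly: a GEP used only as
      // the address of widened consecutive accesses stays uniform, while the
      // same GEP feeding a scalarized or gather/scatter access must be
      // replicated per lane and is treated as possibly non-uniform below.)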
5066 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 5067 if (!Ptr) 5068 continue; 5069 5070 // True if all users of Ptr are memory accesses that have Ptr as their 5071 // pointer operand. 5072 auto UsersAreMemAccesses = 5073 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 5074 return getLoadStorePointerOperand(U) == Ptr; 5075 }); 5076 5077 // Ensure the memory instruction will not be scalarized or used by 5078 // gather/scatter, making its pointer operand non-uniform. If the pointer 5079 // operand is used by any instruction other than a memory access, we 5080 // conservatively assume the pointer operand may be non-uniform. 5081 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 5082 PossibleNonUniformPtrs.insert(Ptr); 5083 5084 // If the memory instruction will be vectorized and its pointer operand 5085 // is consecutive-like, or interleaving - the pointer operand should 5086 // remain uniform. 5087 else 5088 ConsecutiveLikePtrs.insert(Ptr); 5089 } 5090 5091 // Add to the Worklist all consecutive and consecutive-like pointers that 5092 // aren't also identified as possibly non-uniform. 5093 for (auto *V : ConsecutiveLikePtrs) 5094 if (!PossibleNonUniformPtrs.count(V)) 5095 addToWorklistIfAllowed(V); 5096 5097 // Expand Worklist in topological order: whenever a new instruction 5098 // is added , its users should be already inside Worklist. It ensures 5099 // a uniform instruction will only be used by uniform instructions. 5100 unsigned idx = 0; 5101 while (idx != Worklist.size()) { 5102 Instruction *I = Worklist[idx++]; 5103 5104 for (auto OV : I->operand_values()) { 5105 // isOutOfScope operands cannot be uniform instructions. 5106 if (isOutOfScope(OV)) 5107 continue; 5108 // First order recurrence Phi's should typically be considered 5109 // non-uniform. 5110 auto *OP = dyn_cast<PHINode>(OV); 5111 if (OP && Legal->isFirstOrderRecurrence(OP)) 5112 continue; 5113 // If all the users of the operand are uniform, then add the 5114 // operand into the uniform worklist. 5115 auto *OI = cast<Instruction>(OV); 5116 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5117 auto *J = cast<Instruction>(U); 5118 return Worklist.count(J) || 5119 (OI == getLoadStorePointerOperand(J) && 5120 isUniformDecision(J, VF)); 5121 })) 5122 addToWorklistIfAllowed(OI); 5123 } 5124 } 5125 5126 // Returns true if Ptr is the pointer operand of a memory access instruction 5127 // I, and I is known to not require scalarization. 5128 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5129 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5130 }; 5131 5132 // For an instruction to be added into Worklist above, all its users inside 5133 // the loop should also be in Worklist. However, this condition cannot be 5134 // true for phi nodes that form a cyclic dependence. We must process phi 5135 // nodes separately. An induction variable will remain uniform if all users 5136 // of the induction variable and induction variable update remain uniform. 5137 // The code below handles both pointer and non-pointer induction variables. 5138 for (auto &Induction : Legal->getInductionVars()) { 5139 auto *Ind = Induction.first; 5140 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5141 5142 // Determine if all users of the induction variable are uniform after 5143 // vectorization. 
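      // For example, an IV whose only users are its own update, the latch
      // compare and the address computation of a widened consecutive access
      // passes both checks below, so the IV phi and its update stay uniform.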
5144 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5145 auto *I = cast<Instruction>(U); 5146 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5147 isVectorizedMemAccessUse(I, Ind); 5148 }); 5149 if (!UniformInd) 5150 continue; 5151 5152 // Determine if all users of the induction variable update instruction are 5153 // uniform after vectorization. 5154 auto UniformIndUpdate = 5155 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5156 auto *I = cast<Instruction>(U); 5157 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5158 isVectorizedMemAccessUse(I, IndUpdate); 5159 }); 5160 if (!UniformIndUpdate) 5161 continue; 5162 5163 // The induction variable and its update instruction will remain uniform. 5164 addToWorklistIfAllowed(Ind); 5165 addToWorklistIfAllowed(IndUpdate); 5166 } 5167 5168 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5169 } 5170 5171 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5172 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5173 5174 if (Legal->getRuntimePointerChecking()->Need) { 5175 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5176 "runtime pointer checks needed. Enable vectorization of this " 5177 "loop with '#pragma clang loop vectorize(enable)' when " 5178 "compiling with -Os/-Oz", 5179 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5180 return true; 5181 } 5182 5183 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5184 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5185 "runtime SCEV checks needed. Enable vectorization of this " 5186 "loop with '#pragma clang loop vectorize(enable)' when " 5187 "compiling with -Os/-Oz", 5188 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5189 return true; 5190 } 5191 5192 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5193 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5194 reportVectorizationFailure("Runtime stride check for small trip count", 5195 "runtime stride == 1 checks needed. Enable vectorization of " 5196 "this loop without such check by compiling with -Os/-Oz", 5197 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5198 return true; 5199 } 5200 5201 return false; 5202 } 5203 5204 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF, 5205 unsigned UserIC) { 5206 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5207 // TODO: It may by useful to do since it's still likely to be dynamically 5208 // uniform if the target can skip. 5209 reportVectorizationFailure( 5210 "Not inserting runtime ptr check for divergent target", 5211 "runtime pointer checks needed. Not enabled for divergent target", 5212 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5213 return None; 5214 } 5215 5216 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5217 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5218 if (TC == 1) { 5219 reportVectorizationFailure("Single iteration (non) loop", 5220 "loop trip count is one, irrelevant for vectorization", 5221 "SingleIterationLoop", ORE, TheLoop); 5222 return None; 5223 } 5224 5225 switch (ScalarEpilogueStatus) { 5226 case CM_ScalarEpilogueAllowed: 5227 return UserVF ? 
UserVF : computeFeasibleMaxVF(TC); 5228 case CM_ScalarEpilogueNotNeededUsePredicate: 5229 LLVM_DEBUG( 5230 dbgs() << "LV: vector predicate hint/switch found.\n" 5231 << "LV: Not allowing scalar epilogue, creating predicated " 5232 << "vector loop.\n"); 5233 break; 5234 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5235 // fallthrough as a special case of OptForSize 5236 case CM_ScalarEpilogueNotAllowedOptSize: 5237 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5238 LLVM_DEBUG( 5239 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5240 else 5241 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5242 << "count.\n"); 5243 5244 // Bail if runtime checks are required, which are not good when optimising 5245 // for size. 5246 if (runtimeChecksRequired()) 5247 return None; 5248 break; 5249 } 5250 5251 // Now try the tail folding 5252 5253 // Invalidate interleave groups that require an epilogue if we can't mask 5254 // the interleave-group. 5255 if (!useMaskedInterleavedAccesses(TTI)) { 5256 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5257 "No decisions should have been taken at this point"); 5258 // Note: There is no need to invalidate any cost modeling decisions here, as 5259 // none were taken so far. 5260 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5261 } 5262 5263 unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC); 5264 assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2"); 5265 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; 5266 if (TC > 0 && TC % MaxVFtimesIC == 0) { 5267 // Accept MaxVF if we do not have a tail. 5268 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5269 return MaxVF; 5270 } 5271 5272 // If we don't know the precise trip count, or if the trip count that we 5273 // found modulo the vectorization factor is not zero, try to fold the tail 5274 // by masking. 5275 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5276 if (Legal->prepareToFoldTailByMasking()) { 5277 FoldTailByMasking = true; 5278 return MaxVF; 5279 } 5280 5281 // If there was a tail-folding hint/switch, but we can't fold the tail by 5282 // masking, fall back to vectorization with a scalar epilogue. 5283 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5284 if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) { 5285 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5286 return None; 5287 } 5288 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5289 "scalar epilogue instead.\n"); 5290 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5291 return MaxVF; 5292 } 5293 5294 if (TC == 0) { 5295 reportVectorizationFailure( 5296 "Unable to calculate the loop count due to complex control flow", 5297 "unable to calculate the loop count due to complex control flow", 5298 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5299 return None; 5300 } 5301 5302 reportVectorizationFailure( 5303 "Cannot optimize for size and vectorize at the same time.", 5304 "cannot optimize for size and vectorize at the same time. 
" 5305 "Enable vectorization of this loop with '#pragma clang loop " 5306 "vectorize(enable)' when compiling with -Os/-Oz", 5307 "NoTailLoopWithOptForSize", ORE, TheLoop); 5308 return None; 5309 } 5310 5311 unsigned 5312 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5313 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5314 unsigned SmallestType, WidestType; 5315 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5316 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5317 5318 // Get the maximum safe dependence distance in bits computed by LAA. 5319 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5320 // the memory accesses that is most restrictive (involved in the smallest 5321 // dependence distance). 5322 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5323 5324 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5325 5326 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5327 // Note that both WidestRegister and WidestType may not be a powers of 2. 5328 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5329 5330 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5331 << " / " << WidestType << " bits.\n"); 5332 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5333 << WidestRegister << " bits.\n"); 5334 5335 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5336 " into one vector!"); 5337 if (MaxVectorSize == 0) { 5338 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5339 MaxVectorSize = 1; 5340 return MaxVectorSize; 5341 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5342 isPowerOf2_32(ConstTripCount)) { 5343 // We need to clamp the VF to be the ConstTripCount. There is no point in 5344 // choosing a higher viable VF as done in the loop below. 5345 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5346 << ConstTripCount << "\n"); 5347 MaxVectorSize = ConstTripCount; 5348 return MaxVectorSize; 5349 } 5350 5351 unsigned MaxVF = MaxVectorSize; 5352 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5353 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5354 // Collect all viable vectorization factors larger than the default MaxVF 5355 // (i.e. MaxVectorSize). 5356 SmallVector<ElementCount, 8> VFs; 5357 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5358 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5359 VFs.push_back(ElementCount::getFixed(VS)); 5360 5361 // For each VF calculate its register usage. 5362 auto RUs = calculateRegisterUsage(VFs); 5363 5364 // Select the largest VF which doesn't require more registers than existing 5365 // ones. 
5366 for (int i = RUs.size() - 1; i >= 0; --i) { 5367 bool Selected = true; 5368 for (auto& pair : RUs[i].MaxLocalUsers) { 5369 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5370 if (pair.second > TargetNumRegisters) 5371 Selected = false; 5372 } 5373 if (Selected) { 5374 MaxVF = VFs[i].getKnownMinValue(); 5375 break; 5376 } 5377 } 5378 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5379 if (MaxVF < MinVF) { 5380 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5381 << ") with target's minimum: " << MinVF << '\n'); 5382 MaxVF = MinVF; 5383 } 5384 } 5385 } 5386 return MaxVF; 5387 } 5388 5389 VectorizationFactor 5390 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5391 float Cost = expectedCost(ElementCount::getFixed(1)).first; 5392 const float ScalarCost = Cost; 5393 unsigned Width = 1; 5394 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5395 5396 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5397 if (ForceVectorization && MaxVF > 1) { 5398 // Ignore scalar width, because the user explicitly wants vectorization. 5399 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5400 // evaluation. 5401 Cost = std::numeric_limits<float>::max(); 5402 } 5403 5404 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5405 // Notice that the vector loop needs to be executed less times, so 5406 // we need to divide the cost of the vector loops by the width of 5407 // the vector elements. 5408 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5409 float VectorCost = C.first / (float)i; 5410 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5411 << " costs: " << (int)VectorCost << ".\n"); 5412 if (!C.second && !ForceVectorization) { 5413 LLVM_DEBUG( 5414 dbgs() << "LV: Not considering vector loop of width " << i 5415 << " because it will not generate any vector instructions.\n"); 5416 continue; 5417 } 5418 if (VectorCost < Cost) { 5419 Cost = VectorCost; 5420 Width = i; 5421 } 5422 } 5423 5424 if (!EnableCondStoresVectorization && NumPredStores) { 5425 reportVectorizationFailure("There are conditional stores.", 5426 "store that is conditionally executed prevents vectorization", 5427 "ConditionalStore", ORE, TheLoop); 5428 Width = 1; 5429 Cost = ScalarCost; 5430 } 5431 5432 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5433 << "LV: Vectorization seems to be not beneficial, " 5434 << "but was forced by a user.\n"); 5435 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5436 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5437 (unsigned)(Width * Cost)}; 5438 return Factor; 5439 } 5440 5441 std::pair<unsigned, unsigned> 5442 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5443 unsigned MinWidth = -1U; 5444 unsigned MaxWidth = 8; 5445 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5446 5447 // For each block. 5448 for (BasicBlock *BB : TheLoop->blocks()) { 5449 // For each instruction in the loop. 5450 for (Instruction &I : BB->instructionsWithoutDebug()) { 5451 Type *T = I.getType(); 5452 5453 // Skip ignored values. 5454 if (ValuesToIgnore.count(&I)) 5455 continue; 5456 5457 // Only examine Loads, Stores and PHINodes. 5458 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5459 continue; 5460 5461 // Examine PHI nodes that are reduction variables. Update the type to 5462 // account for the recurrence type. 
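      // For example, a reduction written as
      //   char s = 0; for (...) s += a[i];
      // may be represented with an i32 phi in the IR while its recurrence
      // type is i8; using the recurrence type here lets the smallest/widest
      // type bounds (and hence the candidate VFs) reflect the narrow value.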
5463 if (auto *PN = dyn_cast<PHINode>(&I)) { 5464 if (!Legal->isReductionVariable(PN)) 5465 continue; 5466 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5467 T = RdxDesc.getRecurrenceType(); 5468 } 5469 5470 // Examine the stored values. 5471 if (auto *ST = dyn_cast<StoreInst>(&I)) 5472 T = ST->getValueOperand()->getType(); 5473 5474 // Ignore loaded pointer types and stored pointer types that are not 5475 // vectorizable. 5476 // 5477 // FIXME: The check here attempts to predict whether a load or store will 5478 // be vectorized. We only know this for certain after a VF has 5479 // been selected. Here, we assume that if an access can be 5480 // vectorized, it will be. We should also look at extending this 5481 // optimization to non-pointer types. 5482 // 5483 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5484 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5485 continue; 5486 5487 MinWidth = std::min(MinWidth, 5488 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5489 MaxWidth = std::max(MaxWidth, 5490 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5491 } 5492 } 5493 5494 return {MinWidth, MaxWidth}; 5495 } 5496 5497 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5498 unsigned LoopCost) { 5499 // -- The interleave heuristics -- 5500 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5501 // There are many micro-architectural considerations that we can't predict 5502 // at this level. For example, frontend pressure (on decode or fetch) due to 5503 // code size, or the number and capabilities of the execution ports. 5504 // 5505 // We use the following heuristics to select the interleave count: 5506 // 1. If the code has reductions, then we interleave to break the cross 5507 // iteration dependency. 5508 // 2. If the loop is really small, then we interleave to reduce the loop 5509 // overhead. 5510 // 3. We don't interleave if we think that we will spill registers to memory 5511 // due to the increased register pressure. 5512 5513 if (!isScalarEpilogueAllowed()) 5514 return 1; 5515 5516 // We used the distance for the interleave count. 5517 if (Legal->getMaxSafeDepDistBytes() != -1U) 5518 return 1; 5519 5520 // Do not interleave loops with a relatively small known or estimated trip 5521 // count. 5522 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5523 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5524 return 1; 5525 5526 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5527 // We divide by these constants so assume that we have at least one 5528 // instruction that uses at least one register. 5529 for (auto& pair : R.MaxLocalUsers) { 5530 pair.second = std::max(pair.second, 1U); 5531 } 5532 5533 // We calculate the interleave count using the following formula. 5534 // Subtract the number of loop invariants from the number of available 5535 // registers. These registers are used by all of the interleaved instances. 5536 // Next, divide the remaining registers by the number of registers that is 5537 // required by the loop, in order to estimate how many parallel instances 5538 // fit without causing spills. All of this is rounded down if necessary to be 5539 // a power of two. We want power of two interleave count to simplify any 5540 // addressing operations or alignment considerations. 
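  // As a rough illustration (hypothetical numbers): with 32 registers in a class,
  // 2 of them held by loop-invariant values and at most 6 values live inside the
  // loop, the basic formula below (ignoring the induction-variable adjustment)
  // yields PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4 interleaved instances.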
5541 // We also want power of two interleave counts to ensure that the induction 5542 // variable of the vector loop wraps to zero, when tail is folded by masking; 5543 // this currently happens when OptForSize, in which case IC is set to 1 above. 5544 unsigned IC = UINT_MAX; 5545 5546 for (auto& pair : R.MaxLocalUsers) { 5547 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5548 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5549 << " registers of " 5550 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5551 if (VF == 1) { 5552 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5553 TargetNumRegisters = ForceTargetNumScalarRegs; 5554 } else { 5555 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5556 TargetNumRegisters = ForceTargetNumVectorRegs; 5557 } 5558 unsigned MaxLocalUsers = pair.second; 5559 unsigned LoopInvariantRegs = 0; 5560 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5561 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5562 5563 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5564 // Don't count the induction variable as interleaved. 5565 if (EnableIndVarRegisterHeur) { 5566 TmpIC = 5567 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5568 std::max(1U, (MaxLocalUsers - 1))); 5569 } 5570 5571 IC = std::min(IC, TmpIC); 5572 } 5573 5574 // Clamp the interleave ranges to reasonable counts. 5575 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5576 unsigned MaxInterleaveCount = 5577 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5578 5579 // Check if the user has overridden the max. 5580 if (VF == 1) { 5581 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5582 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5583 } else { 5584 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5585 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5586 } 5587 5588 // If trip count is known or estimated compile time constant, limit the 5589 // interleave count to be less than the trip count divided by VF. 5590 if (BestKnownTC) { 5591 MaxInterleaveCount = 5592 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5593 } 5594 5595 // If we did not calculate the cost for VF (because the user selected the VF) 5596 // then we calculate the cost of VF here. 5597 if (LoopCost == 0) 5598 LoopCost = expectedCost(VF).first; 5599 5600 assert(LoopCost && "Non-zero loop cost expected"); 5601 5602 // Clamp the calculated IC to be between the 1 and the max interleave count 5603 // that the target and trip count allows. 5604 if (IC > MaxInterleaveCount) 5605 IC = MaxInterleaveCount; 5606 else if (IC < 1) 5607 IC = 1; 5608 5609 // Interleave if we vectorized this loop and there is a reduction that could 5610 // benefit from interleaving. 5611 if (VF.isVector() && !Legal->getReductionVars().empty()) { 5612 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5613 return IC; 5614 } 5615 5616 // Note that if we've already vectorized the loop we will have done the 5617 // runtime check and so interleaving won't require further checks. 5618 bool InterleavingRequiresRuntimePointerCheck = 5619 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5620 5621 // We want to interleave small loops in order to reduce the loop overhead and 5622 // potentially expose ILP opportunities. 
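  // For instance (illustrative costs; SmallLoopCost is a command-line knob whose
  // default may differ): with LoopCost = 4 and SmallLoopCost = 20, the small-loop
  // path below computes SmallIC = min(IC, PowerOf2Floor(20 / 4)) = min(IC, 4).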
5623 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5624 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5625 // We assume that the cost overhead is 1 and we use the cost model 5626 // to estimate the cost of the loop and interleave until the cost of the 5627 // loop overhead is about 5% of the cost of the loop. 5628 unsigned SmallIC = 5629 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5630 5631 // Interleave until store/load ports (estimated by max interleave count) are 5632 // saturated. 5633 unsigned NumStores = Legal->getNumStores(); 5634 unsigned NumLoads = Legal->getNumLoads(); 5635 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5636 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5637 5638 // If we have a scalar reduction (vector reductions are already dealt with 5639 // by this point), we can increase the critical path length if the loop 5640 // we're interleaving is inside another loop. Limit, by default to 2, so the 5641 // critical path only gets increased by one reduction operation. 5642 if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) { 5643 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5644 SmallIC = std::min(SmallIC, F); 5645 StoresIC = std::min(StoresIC, F); 5646 LoadsIC = std::min(LoadsIC, F); 5647 } 5648 5649 if (EnableLoadStoreRuntimeInterleave && 5650 std::max(StoresIC, LoadsIC) > SmallIC) { 5651 LLVM_DEBUG( 5652 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5653 return std::max(StoresIC, LoadsIC); 5654 } 5655 5656 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5657 return SmallIC; 5658 } 5659 5660 // Interleave if this is a large loop (small loops are already dealt with by 5661 // this point) that could benefit from interleaving. 5662 bool HasReductions = !Legal->getReductionVars().empty(); 5663 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5664 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5665 return IC; 5666 } 5667 5668 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5669 return 1; 5670 } 5671 5672 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5673 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5674 // This function calculates the register usage by measuring the highest number 5675 // of values that are alive at a single location. Obviously, this is a very 5676 // rough estimation. We scan the loop in a topological order in order and 5677 // assign a number to each instruction. We use RPO to ensure that defs are 5678 // met before their users. We assume that each instruction that has in-loop 5679 // users starts an interval. We record every time that an in-loop value is 5680 // used, so we have a list of the first and last occurrences of each 5681 // instruction. Next, we transpose this data structure into a multi map that 5682 // holds the list of intervals that *end* at a specific location. This multi 5683 // map allows us to perform a linear search. We scan the instructions linearly 5684 // and record each time that a new interval starts, by placing it in a set. 5685 // If we find this value in the multi-map then we remove it from the set. 5686 // The max register usage is the maximum size of the set. 5687 // We also search for instructions that are defined outside the loop, but are 5688 // used inside the loop. 
We need this number separately from the max-interval 5689 // usage number because when we unroll, loop-invariant values do not take 5690 // more register. 5691 LoopBlocksDFS DFS(TheLoop); 5692 DFS.perform(LI); 5693 5694 RegisterUsage RU; 5695 5696 // Each 'key' in the map opens a new interval. The values 5697 // of the map are the index of the 'last seen' usage of the 5698 // instruction that is the key. 5699 using IntervalMap = DenseMap<Instruction *, unsigned>; 5700 5701 // Maps instruction to its index. 5702 SmallVector<Instruction *, 64> IdxToInstr; 5703 // Marks the end of each interval. 5704 IntervalMap EndPoint; 5705 // Saves the list of instruction indices that are used in the loop. 5706 SmallPtrSet<Instruction *, 8> Ends; 5707 // Saves the list of values that are used in the loop but are 5708 // defined outside the loop, such as arguments and constants. 5709 SmallPtrSet<Value *, 8> LoopInvariants; 5710 5711 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5712 for (Instruction &I : BB->instructionsWithoutDebug()) { 5713 IdxToInstr.push_back(&I); 5714 5715 // Save the end location of each USE. 5716 for (Value *U : I.operands()) { 5717 auto *Instr = dyn_cast<Instruction>(U); 5718 5719 // Ignore non-instruction values such as arguments, constants, etc. 5720 if (!Instr) 5721 continue; 5722 5723 // If this instruction is outside the loop then record it and continue. 5724 if (!TheLoop->contains(Instr)) { 5725 LoopInvariants.insert(Instr); 5726 continue; 5727 } 5728 5729 // Overwrite previous end points. 5730 EndPoint[Instr] = IdxToInstr.size(); 5731 Ends.insert(Instr); 5732 } 5733 } 5734 } 5735 5736 // Saves the list of intervals that end with the index in 'key'. 5737 using InstrList = SmallVector<Instruction *, 2>; 5738 DenseMap<unsigned, InstrList> TransposeEnds; 5739 5740 // Transpose the EndPoints to a list of values that end at each index. 5741 for (auto &Interval : EndPoint) 5742 TransposeEnds[Interval.second].push_back(Interval.first); 5743 5744 SmallPtrSet<Instruction *, 8> OpenIntervals; 5745 5746 // Get the size of the widest register. 5747 unsigned MaxSafeDepDist = -1U; 5748 if (Legal->getMaxSafeDepDistBytes() != -1U) 5749 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5750 unsigned WidestRegister = 5751 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5752 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5753 5754 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5755 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5756 5757 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5758 5759 // A lambda that gets the register usage for the given type and VF. 5760 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) { 5761 if (Ty->isTokenTy()) 5762 return 0U; 5763 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5764 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5765 return std::max<unsigned>(1, VF.getKnownMinValue() * TypeSize / 5766 WidestRegister); 5767 }; 5768 5769 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5770 Instruction *I = IdxToInstr[i]; 5771 5772 // Remove all of the instructions that end at this location. 5773 InstrList &List = TransposeEnds[i]; 5774 for (Instruction *ToRemove : List) 5775 OpenIntervals.erase(ToRemove); 5776 5777 // Ignore instructions that are never used within the loop. 5778 if (!Ends.count(I)) 5779 continue; 5780 5781 // Skip ignored values. 
5782 if (ValuesToIgnore.count(I)) 5783 continue; 5784 5785 // For each VF find the maximum usage of registers. 5786 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5787 // Count the number of live intervals. 5788 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5789 5790 if (VFs[j].isScalar()) { 5791 for (auto Inst : OpenIntervals) { 5792 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5793 if (RegUsage.find(ClassID) == RegUsage.end()) 5794 RegUsage[ClassID] = 1; 5795 else 5796 RegUsage[ClassID] += 1; 5797 } 5798 } else { 5799 collectUniformsAndScalars(VFs[j]); 5800 for (auto Inst : OpenIntervals) { 5801 // Skip ignored values for VF > 1. 5802 if (VecValuesToIgnore.count(Inst)) 5803 continue; 5804 if (isScalarAfterVectorization(Inst, VFs[j])) { 5805 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5806 if (RegUsage.find(ClassID) == RegUsage.end()) 5807 RegUsage[ClassID] = 1; 5808 else 5809 RegUsage[ClassID] += 1; 5810 } else { 5811 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5812 if (RegUsage.find(ClassID) == RegUsage.end()) 5813 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5814 else 5815 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5816 } 5817 } 5818 } 5819 5820 for (auto& pair : RegUsage) { 5821 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5822 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5823 else 5824 MaxUsages[j][pair.first] = pair.second; 5825 } 5826 } 5827 5828 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5829 << OpenIntervals.size() << '\n'); 5830 5831 // Add the current instruction to the list of open intervals. 5832 OpenIntervals.insert(I); 5833 } 5834 5835 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5836 SmallMapVector<unsigned, unsigned, 4> Invariant; 5837 5838 for (auto Inst : LoopInvariants) { 5839 unsigned Usage = 5840 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5841 unsigned ClassID = 5842 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 5843 if (Invariant.find(ClassID) == Invariant.end()) 5844 Invariant[ClassID] = Usage; 5845 else 5846 Invariant[ClassID] += Usage; 5847 } 5848 5849 LLVM_DEBUG({ 5850 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5851 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5852 << " item\n"; 5853 for (const auto &pair : MaxUsages[i]) { 5854 dbgs() << "LV(REG): RegisterClass: " 5855 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5856 << " registers\n"; 5857 } 5858 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5859 << " item\n"; 5860 for (const auto &pair : Invariant) { 5861 dbgs() << "LV(REG): RegisterClass: " 5862 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5863 << " registers\n"; 5864 } 5865 }); 5866 5867 RU.LoopInvariantRegs = Invariant; 5868 RU.MaxLocalUsers = MaxUsages[i]; 5869 RUs[i] = RU; 5870 } 5871 5872 return RUs; 5873 } 5874 5875 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5876 // TODO: Cost model for emulated masked load/store is completely 5877 // broken. This hack guides the cost model to use an artificially 5878 // high enough value to practically disable vectorization with such 5879 // operations, except where previously deployed legality hack allowed 5880 // using very low cost values. This is to avoid regressions coming simply 5881 // from moving "masked load/store" check from legality to cost model. 
5882 // Masked Load/Gather emulation was previously never allowed. 5883 // Limited number of Masked Store/Scatter emulation was allowed. 5884 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5885 return isa<LoadInst>(I) || 5886 (isa<StoreInst>(I) && 5887 NumPredStores > NumberOfStoresToPredicate); 5888 } 5889 5890 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 5891 // If we aren't vectorizing the loop, or if we've already collected the 5892 // instructions to scalarize, there's nothing to do. Collection may already 5893 // have occurred if we have a user-selected VF and are now computing the 5894 // expected cost for interleaving. 5895 if (VF.isScalar() || VF.isZero() || 5896 InstsToScalarize.find(VF) != InstsToScalarize.end()) 5897 return; 5898 5899 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5900 // not profitable to scalarize any instructions, the presence of VF in the 5901 // map will indicate that we've analyzed it already. 5902 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5903 5904 // Find all the instructions that are scalar with predication in the loop and 5905 // determine if it would be better to not if-convert the blocks they are in. 5906 // If so, we also record the instructions to scalarize. 5907 for (BasicBlock *BB : TheLoop->blocks()) { 5908 if (!blockNeedsPredication(BB)) 5909 continue; 5910 for (Instruction &I : *BB) 5911 if (isScalarWithPredication(&I)) { 5912 ScalarCostsTy ScalarCosts; 5913 // Do not apply discount logic if hacked cost is needed 5914 // for emulated masked memrefs. 5915 if (!useEmulatedMaskMemRefHack(&I) && 5916 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5917 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5918 // Remember that BB will remain after vectorization. 5919 PredicatedBBsAfterVectorization.insert(BB); 5920 } 5921 } 5922 } 5923 5924 int LoopVectorizationCostModel::computePredInstDiscount( 5925 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5926 ElementCount VF) { 5927 assert(!isUniformAfterVectorization(PredInst, VF) && 5928 "Instruction marked uniform-after-vectorization will be predicated"); 5929 5930 // Initialize the discount to zero, meaning that the scalar version and the 5931 // vector version cost the same. 5932 int Discount = 0; 5933 5934 // Holds instructions to analyze. The instructions we visit are mapped in 5935 // ScalarCosts. Those instructions are the ones that would be scalarized if 5936 // we find that the scalar version costs less. 5937 SmallVector<Instruction *, 8> Worklist; 5938 5939 // Returns true if the given instruction can be scalarized. 5940 auto canBeScalarized = [&](Instruction *I) -> bool { 5941 // We only attempt to scalarize instructions forming a single-use chain 5942 // from the original predicated block that would otherwise be vectorized. 5943 // Although not strictly necessary, we give up on instructions we know will 5944 // already be scalar to avoid traversing chains that are unlikely to be 5945 // beneficial. 5946 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5947 isScalarAfterVectorization(I, VF)) 5948 return false; 5949 5950 // If the instruction is scalar with predication, it will be analyzed 5951 // separately. We ignore it within the context of PredInst. 5952 if (isScalarWithPredication(I)) 5953 return false; 5954 5955 // If any of the instruction's operands are uniform after vectorization, 5956 // the instruction cannot be scalarized. 
This prevents, for example, a 5957 // masked load from being scalarized. 5958 // 5959 // We assume we will only emit a value for lane zero of an instruction 5960 // marked uniform after vectorization, rather than VF identical values. 5961 // Thus, if we scalarize an instruction that uses a uniform, we would 5962 // create uses of values corresponding to the lanes we aren't emitting code 5963 // for. This behavior can be changed by allowing getScalarValue to clone 5964 // the lane zero values for uniforms rather than asserting. 5965 for (Use &U : I->operands()) 5966 if (auto *J = dyn_cast<Instruction>(U.get())) 5967 if (isUniformAfterVectorization(J, VF)) 5968 return false; 5969 5970 // Otherwise, we can scalarize the instruction. 5971 return true; 5972 }; 5973 5974 // Compute the expected cost discount from scalarizing the entire expression 5975 // feeding the predicated instruction. We currently only consider expressions 5976 // that are single-use instruction chains. 5977 Worklist.push_back(PredInst); 5978 while (!Worklist.empty()) { 5979 Instruction *I = Worklist.pop_back_val(); 5980 5981 // If we've already analyzed the instruction, there's nothing to do. 5982 if (ScalarCosts.find(I) != ScalarCosts.end()) 5983 continue; 5984 5985 // Compute the cost of the vector instruction. Note that this cost already 5986 // includes the scalarization overhead of the predicated instruction. 5987 unsigned VectorCost = getInstructionCost(I, VF).first; 5988 5989 // Compute the cost of the scalarized instruction. This cost is the cost of 5990 // the instruction as if it wasn't if-converted and instead remained in the 5991 // predicated block. We will scale this cost by block probability after 5992 // computing the scalarization overhead. 5993 assert(!VF.isScalable() && "scalable vectors not yet supported."); 5994 unsigned ScalarCost = 5995 VF.getKnownMinValue() * 5996 getInstructionCost(I, ElementCount::getFixed(1)).first; 5997 5998 // Compute the scalarization overhead of needed insertelement instructions 5999 // and phi nodes. 6000 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6001 ScalarCost += TTI.getScalarizationOverhead( 6002 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6003 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6004 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6005 ScalarCost += 6006 VF.getKnownMinValue() * 6007 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6008 } 6009 6010 // Compute the scalarization overhead of needed extractelement 6011 // instructions. For each of the instruction's operands, if the operand can 6012 // be scalarized, add it to the worklist; otherwise, account for the 6013 // overhead. 6014 for (Use &U : I->operands()) 6015 if (auto *J = dyn_cast<Instruction>(U.get())) { 6016 assert(VectorType::isValidElementType(J->getType()) && 6017 "Instruction has non-scalar type"); 6018 if (canBeScalarized(J)) 6019 Worklist.push_back(J); 6020 else if (needsExtract(J, VF)) { 6021 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6022 ScalarCost += TTI.getScalarizationOverhead( 6023 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6024 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6025 } 6026 } 6027 6028 // Scale the total scalar cost by block probability. 6029 ScalarCost /= getReciprocalPredBlockProb(); 6030 6031 // Compute the discount. A non-negative discount means the vector version 6032 // of the instruction costs more, and scalarizing would be beneficial. 
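    // Illustrative numbers: if VectorCost = 8 and the probability-scaled ScalarCost = 5,
    // the accumulation below adds 3 to the discount, i.e. scalarizing this chain is
    // estimated to be 3 units cheaper than keeping it vectorized.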
6033 Discount += VectorCost - ScalarCost; 6034 ScalarCosts[I] = ScalarCost; 6035 } 6036 6037 return Discount; 6038 } 6039 6040 LoopVectorizationCostModel::VectorizationCostTy 6041 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6042 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6043 VectorizationCostTy Cost; 6044 6045 // For each block. 6046 for (BasicBlock *BB : TheLoop->blocks()) { 6047 VectorizationCostTy BlockCost; 6048 6049 // For each instruction in the old loop. 6050 for (Instruction &I : BB->instructionsWithoutDebug()) { 6051 // Skip ignored values. 6052 if (ValuesToIgnore.count(&I) || 6053 (VF.isVector() && VecValuesToIgnore.count(&I))) 6054 continue; 6055 6056 VectorizationCostTy C = getInstructionCost(&I, VF); 6057 6058 // Check if we should override the cost. 6059 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6060 C.first = ForceTargetInstructionCost; 6061 6062 BlockCost.first += C.first; 6063 BlockCost.second |= C.second; 6064 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6065 << " for VF " << VF << " For instruction: " << I 6066 << '\n'); 6067 } 6068 6069 // If we are vectorizing a predicated block, it will have been 6070 // if-converted. This means that the block's instructions (aside from 6071 // stores and instructions that may divide by zero) will now be 6072 // unconditionally executed. For the scalar case, we may not always execute 6073 // the predicated block. Thus, scale the block's cost by the probability of 6074 // executing it. 6075 if (VF.isScalar() && blockNeedsPredication(BB)) 6076 BlockCost.first /= getReciprocalPredBlockProb(); 6077 6078 Cost.first += BlockCost.first; 6079 Cost.second |= BlockCost.second; 6080 } 6081 6082 return Cost; 6083 } 6084 6085 /// Gets Address Access SCEV after verifying that the access pattern 6086 /// is loop invariant except the induction variable dependence. 6087 /// 6088 /// This SCEV can be sent to the Target in order to estimate the address 6089 /// calculation cost. 6090 static const SCEV *getAddressAccessSCEV( 6091 Value *Ptr, 6092 LoopVectorizationLegality *Legal, 6093 PredicatedScalarEvolution &PSE, 6094 const Loop *TheLoop) { 6095 6096 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6097 if (!Gep) 6098 return nullptr; 6099 6100 // We are looking for a gep with all loop invariant indices except for one 6101 // which should be an induction variable. 6102 auto SE = PSE.getSE(); 6103 unsigned NumOperands = Gep->getNumOperands(); 6104 for (unsigned i = 1; i < NumOperands; ++i) { 6105 Value *Opd = Gep->getOperand(i); 6106 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6107 !Legal->isInductionVariable(Opd)) 6108 return nullptr; 6109 } 6110 6111 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 
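  // An illustrative (made-up) IR shape that reaches this point:
  //   %gep = getelementptr [512 x [512 x float]], [512 x [512 x float]]* %A,
  //                        i64 %inv, i64 %iv
  // where %inv is loop-invariant and %iv is the induction variable.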
6112 return PSE.getSCEV(Ptr); 6113 } 6114 6115 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6116 return Legal->hasStride(I->getOperand(0)) || 6117 Legal->hasStride(I->getOperand(1)); 6118 } 6119 6120 unsigned 6121 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6122 ElementCount VF) { 6123 assert(VF.isVector() && 6124 "Scalarization cost of instruction implies vectorization."); 6125 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6126 Type *ValTy = getMemInstValueType(I); 6127 auto SE = PSE.getSE(); 6128 6129 unsigned AS = getLoadStoreAddressSpace(I); 6130 Value *Ptr = getLoadStorePointerOperand(I); 6131 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6132 6133 // Figure out whether the access is strided and get the stride value 6134 // if it's known at compile time. 6135 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6136 6137 // Get the cost of the scalar memory instruction and address computation. 6138 unsigned Cost = 6139 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6140 6141 // Don't pass *I here, since it is scalar but will actually be part of a 6142 // vectorized loop where the user of it is a vectorized instruction. 6143 const Align Alignment = getLoadStoreAlignment(I); 6144 Cost += VF.getKnownMinValue() * 6145 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6146 AS, TTI::TCK_RecipThroughput); 6147 6148 // Get the overhead of the extractelement and insertelement instructions 6149 // we might create due to scalarization. 6150 Cost += getScalarizationOverhead(I, VF); 6151 6152 // If we have a predicated store, it may not be executed for each vector 6153 // lane. Scale the cost by the probability of executing the predicated 6154 // block. 6155 if (isPredicatedInst(I)) { 6156 Cost /= getReciprocalPredBlockProb(); 6157 6158 if (useEmulatedMaskMemRefHack(I)) 6159 // Artificially setting to a high enough value to practically disable 6160 // vectorization with such operations.
6161 Cost = 3000000; 6162 } 6163 6164 return Cost; 6165 } 6166 6167 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6168 ElementCount VF) { 6169 Type *ValTy = getMemInstValueType(I); 6170 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6171 Value *Ptr = getLoadStorePointerOperand(I); 6172 unsigned AS = getLoadStoreAddressSpace(I); 6173 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6174 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6175 6176 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6177 "Stride should be 1 or -1 for consecutive memory access"); 6178 const Align Alignment = getLoadStoreAlignment(I); 6179 unsigned Cost = 0; 6180 if (Legal->isMaskRequired(I)) 6181 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6182 CostKind); 6183 else 6184 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6185 CostKind, I); 6186 6187 bool Reverse = ConsecutiveStride < 0; 6188 if (Reverse) 6189 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6190 return Cost; 6191 } 6192 6193 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6194 ElementCount VF) { 6195 Type *ValTy = getMemInstValueType(I); 6196 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6197 const Align Alignment = getLoadStoreAlignment(I); 6198 unsigned AS = getLoadStoreAddressSpace(I); 6199 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6200 if (isa<LoadInst>(I)) { 6201 return TTI.getAddressComputationCost(ValTy) + 6202 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6203 CostKind) + 6204 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6205 } 6206 StoreInst *SI = cast<StoreInst>(I); 6207 6208 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6209 return TTI.getAddressComputationCost(ValTy) + 6210 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6211 CostKind) + 6212 (isLoopInvariantStoreValue 6213 ? 0 6214 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6215 VF.getKnownMinValue() - 1)); 6216 } 6217 6218 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6219 ElementCount VF) { 6220 Type *ValTy = getMemInstValueType(I); 6221 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6222 const Align Alignment = getLoadStoreAlignment(I); 6223 const Value *Ptr = getLoadStorePointerOperand(I); 6224 6225 return TTI.getAddressComputationCost(VectorTy) + 6226 TTI.getGatherScatterOpCost( 6227 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6228 TargetTransformInfo::TCK_RecipThroughput, I); 6229 } 6230 6231 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6232 ElementCount VF) { 6233 Type *ValTy = getMemInstValueType(I); 6234 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6235 unsigned AS = getLoadStoreAddressSpace(I); 6236 6237 auto Group = getInterleavedAccessGroup(I); 6238 assert(Group && "Fail to get an interleaved access group."); 6239 6240 unsigned InterleaveFactor = Group->getFactor(); 6241 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6242 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6243 6244 // Holds the indices of existing members in an interleaved load group. 6245 // An interleaved store group doesn't need this as it doesn't allow gaps. 
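  // Illustrative case: a factor-3 interleaved load group built from accesses to
  // a[3*i] and a[3*i+2] has a gap at position 1, so Indices ends up as {0, 2} below.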
6246 SmallVector<unsigned, 4> Indices; 6247 if (isa<LoadInst>(I)) { 6248 for (unsigned i = 0; i < InterleaveFactor; i++) 6249 if (Group->getMember(i)) 6250 Indices.push_back(i); 6251 } 6252 6253 // Calculate the cost of the whole interleaved group. 6254 bool UseMaskForGaps = 6255 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6256 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6257 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6258 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6259 6260 if (Group->isReverse()) { 6261 // TODO: Add support for reversed masked interleaved access. 6262 assert(!Legal->isMaskRequired(I) && 6263 "Reverse masked interleaved access not supported."); 6264 Cost += Group->getNumMembers() * 6265 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6266 } 6267 return Cost; 6268 } 6269 6270 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6271 ElementCount VF) { 6272 // Calculate scalar cost only. Vectorization cost should be ready at this 6273 // moment. 6274 if (VF.isScalar()) { 6275 Type *ValTy = getMemInstValueType(I); 6276 const Align Alignment = getLoadStoreAlignment(I); 6277 unsigned AS = getLoadStoreAddressSpace(I); 6278 6279 return TTI.getAddressComputationCost(ValTy) + 6280 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6281 TTI::TCK_RecipThroughput, I); 6282 } 6283 return getWideningCost(I, VF); 6284 } 6285 6286 LoopVectorizationCostModel::VectorizationCostTy 6287 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6288 ElementCount VF) { 6289 assert(!VF.isScalable() && 6290 "the cost model is not yet implemented for scalable vectorization"); 6291 // If we know that this instruction will remain uniform, check the cost of 6292 // the scalar version. 6293 if (isUniformAfterVectorization(I, VF)) 6294 VF = ElementCount::getFixed(1); 6295 6296 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6297 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6298 6299 // Forced scalars do not have any scalarization overhead. 6300 auto ForcedScalar = ForcedScalars.find(VF); 6301 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6302 auto InstSet = ForcedScalar->second; 6303 if (InstSet.count(I)) 6304 return VectorizationCostTy( 6305 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6306 VF.getKnownMinValue()), 6307 false); 6308 } 6309 6310 Type *VectorTy; 6311 unsigned C = getInstructionCost(I, VF, VectorTy); 6312 6313 bool TypeNotScalarized = 6314 VF.isVector() && VectorTy->isVectorTy() && 6315 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6316 return VectorizationCostTy(C, TypeNotScalarized); 6317 } 6318 6319 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6320 ElementCount VF) { 6321 6322 assert(!VF.isScalable() && 6323 "cannot compute scalarization overhead for scalable vectorization"); 6324 if (VF.isScalar()) 6325 return 0; 6326 6327 unsigned Cost = 0; 6328 Type *RetTy = ToVectorTy(I->getType(), VF); 6329 if (!RetTy->isVoidTy() && 6330 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6331 Cost += TTI.getScalarizationOverhead( 6332 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 6333 true, false); 6334 6335 // Some targets keep addresses scalar. 6336 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6337 return Cost; 6338 6339 // Some targets support efficient element stores. 
6340 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6341 return Cost; 6342 6343 // Collect operands to consider. 6344 CallInst *CI = dyn_cast<CallInst>(I); 6345 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6346 6347 // Skip operands that do not require extraction/scalarization and do not incur 6348 // any overhead. 6349 return Cost + TTI.getOperandsScalarizationOverhead( 6350 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 6351 } 6352 6353 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6354 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6355 if (VF.isScalar()) 6356 return; 6357 NumPredStores = 0; 6358 for (BasicBlock *BB : TheLoop->blocks()) { 6359 // For each instruction in the old loop. 6360 for (Instruction &I : *BB) { 6361 Value *Ptr = getLoadStorePointerOperand(&I); 6362 if (!Ptr) 6363 continue; 6364 6365 // TODO: We should generate better code and update the cost model for 6366 // predicated uniform stores. Today they are treated as any other 6367 // predicated store (see added test cases in 6368 // invariant-store-vectorization.ll). 6369 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6370 NumPredStores++; 6371 6372 if (Legal->isUniform(Ptr) && 6373 // Conditional loads and stores should be scalarized and predicated. 6374 // isScalarWithPredication cannot be used here since masked 6375 // gather/scatters are not considered scalar with predication. 6376 !Legal->blockNeedsPredication(I.getParent())) { 6377 // TODO: Avoid replicating loads and stores instead of 6378 // relying on instcombine to remove them. 6379 // Load: Scalar load + broadcast 6380 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6381 unsigned Cost = getUniformMemOpCost(&I, VF); 6382 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6383 continue; 6384 } 6385 6386 // We assume that widening is the best solution when possible. 6387 if (memoryInstructionCanBeWidened(&I, VF)) { 6388 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6389 int ConsecutiveStride = 6390 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6391 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6392 "Expected consecutive stride."); 6393 InstWidening Decision = 6394 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6395 setWideningDecision(&I, VF, Decision, Cost); 6396 continue; 6397 } 6398 6399 // Choose between Interleaving, Gather/Scatter or Scalarization. 6400 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6401 unsigned NumAccesses = 1; 6402 if (isAccessInterleaved(&I)) { 6403 auto Group = getInterleavedAccessGroup(&I); 6404 assert(Group && "Fail to get an interleaved access group."); 6405 6406 // Make one decision for the whole group. 6407 if (getWideningDecision(&I, VF) != CM_Unknown) 6408 continue; 6409 6410 NumAccesses = Group->getNumMembers(); 6411 if (interleavedAccessCanBeWidened(&I, VF)) 6412 InterleaveCost = getInterleaveGroupCost(&I, VF); 6413 } 6414 6415 unsigned GatherScatterCost = 6416 isLegalGatherOrScatter(&I) 6417 ? getGatherScatterCost(&I, VF) * NumAccesses 6418 : std::numeric_limits<unsigned>::max(); 6419 6420 unsigned ScalarizationCost = 6421 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6422 6423 // Choose better solution for the current VF, 6424 // write down this decision and use it during vectorization. 
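      // Illustrative comparison (hypothetical costs): InterleaveCost = 6,
      // GatherScatterCost = UINT_MAX (no legal gather/scatter) and
      // ScalarizationCost = 10 lead to CM_Interleave with Cost = 6 below.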
6425 unsigned Cost; 6426 InstWidening Decision; 6427 if (InterleaveCost <= GatherScatterCost && 6428 InterleaveCost < ScalarizationCost) { 6429 Decision = CM_Interleave; 6430 Cost = InterleaveCost; 6431 } else if (GatherScatterCost < ScalarizationCost) { 6432 Decision = CM_GatherScatter; 6433 Cost = GatherScatterCost; 6434 } else { 6435 Decision = CM_Scalarize; 6436 Cost = ScalarizationCost; 6437 } 6438 // If the instructions belongs to an interleave group, the whole group 6439 // receives the same decision. The whole group receives the cost, but 6440 // the cost will actually be assigned to one instruction. 6441 if (auto Group = getInterleavedAccessGroup(&I)) 6442 setWideningDecision(Group, VF, Decision, Cost); 6443 else 6444 setWideningDecision(&I, VF, Decision, Cost); 6445 } 6446 } 6447 6448 // Make sure that any load of address and any other address computation 6449 // remains scalar unless there is gather/scatter support. This avoids 6450 // inevitable extracts into address registers, and also has the benefit of 6451 // activating LSR more, since that pass can't optimize vectorized 6452 // addresses. 6453 if (TTI.prefersVectorizedAddressing()) 6454 return; 6455 6456 // Start with all scalar pointer uses. 6457 SmallPtrSet<Instruction *, 8> AddrDefs; 6458 for (BasicBlock *BB : TheLoop->blocks()) 6459 for (Instruction &I : *BB) { 6460 Instruction *PtrDef = 6461 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6462 if (PtrDef && TheLoop->contains(PtrDef) && 6463 getWideningDecision(&I, VF) != CM_GatherScatter) 6464 AddrDefs.insert(PtrDef); 6465 } 6466 6467 // Add all instructions used to generate the addresses. 6468 SmallVector<Instruction *, 4> Worklist; 6469 for (auto *I : AddrDefs) 6470 Worklist.push_back(I); 6471 while (!Worklist.empty()) { 6472 Instruction *I = Worklist.pop_back_val(); 6473 for (auto &Op : I->operands()) 6474 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6475 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6476 AddrDefs.insert(InstOp).second) 6477 Worklist.push_back(InstOp); 6478 } 6479 6480 for (auto *I : AddrDefs) { 6481 if (isa<LoadInst>(I)) { 6482 // Setting the desired widening decision should ideally be handled in 6483 // by cost functions, but since this involves the task of finding out 6484 // if the loaded register is involved in an address computation, it is 6485 // instead changed here when we know this is the case. 6486 InstWidening Decision = getWideningDecision(I, VF); 6487 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6488 // Scalarize a widened load of address. 6489 setWideningDecision( 6490 I, VF, CM_Scalarize, 6491 (VF.getKnownMinValue() * 6492 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6493 else if (auto Group = getInterleavedAccessGroup(I)) { 6494 // Scalarize an interleave group of address loads. 6495 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6496 if (Instruction *Member = Group->getMember(I)) 6497 setWideningDecision( 6498 Member, VF, CM_Scalarize, 6499 (VF.getKnownMinValue() * 6500 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6501 } 6502 } 6503 } else 6504 // Make sure I gets scalarized and a cost estimate without 6505 // scalarization overhead. 
6506 ForcedScalars[VF].insert(I); 6507 } 6508 } 6509 6510 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6511 ElementCount VF, 6512 Type *&VectorTy) { 6513 Type *RetTy = I->getType(); 6514 if (canTruncateToMinimalBitwidth(I, VF)) 6515 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6516 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6517 auto SE = PSE.getSE(); 6518 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6519 6520 // TODO: We need to estimate the cost of intrinsic calls. 6521 switch (I->getOpcode()) { 6522 case Instruction::GetElementPtr: 6523 // We mark this instruction as zero-cost because the cost of GEPs in 6524 // vectorized code depends on whether the corresponding memory instruction 6525 // is scalarized or not. Therefore, we handle GEPs with the memory 6526 // instruction cost. 6527 return 0; 6528 case Instruction::Br: { 6529 // In cases of scalarized and predicated instructions, there will be VF 6530 // predicated blocks in the vectorized loop. Each branch around these 6531 // blocks requires also an extract of its vector compare i1 element. 6532 bool ScalarPredicatedBB = false; 6533 BranchInst *BI = cast<BranchInst>(I); 6534 if (VF.isVector() && BI->isConditional() && 6535 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6536 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6537 ScalarPredicatedBB = true; 6538 6539 if (ScalarPredicatedBB) { 6540 // Return cost for branches around scalarized and predicated blocks. 6541 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6542 auto *Vec_i1Ty = 6543 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6544 return (TTI.getScalarizationOverhead( 6545 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6546 false, true) + 6547 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 6548 VF.getKnownMinValue())); 6549 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6550 // The back-edge branch will remain, as will all scalar branches. 6551 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6552 else 6553 // This branch will be eliminated by if-conversion. 6554 return 0; 6555 // Note: We currently assume zero cost for an unconditional branch inside 6556 // a predicated block since it will become a fall-through, although we 6557 // may decide in the future to call TTI for all branches. 6558 } 6559 case Instruction::PHI: { 6560 auto *Phi = cast<PHINode>(I); 6561 6562 // First-order recurrences are replaced by vector shuffles inside the loop. 6563 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6564 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 6565 return TTI.getShuffleCost( 6566 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 6567 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 6568 6569 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6570 // converted into select instructions. We require N - 1 selects per phi 6571 // node, where N is the number of incoming values. 
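    // E.g. (illustrative) a non-header phi merging 3 incoming values becomes 2 vector
    // selects, so its cost below is 2 * the select cost for the phi's vector type.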
6572 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 6573 return (Phi->getNumIncomingValues() - 1) * 6574 TTI.getCmpSelInstrCost( 6575 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6576 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6577 CostKind); 6578 6579 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6580 } 6581 case Instruction::UDiv: 6582 case Instruction::SDiv: 6583 case Instruction::URem: 6584 case Instruction::SRem: 6585 // If we have a predicated instruction, it may not be executed for each 6586 // vector lane. Get the scalarization cost and scale this amount by the 6587 // probability of executing the predicated block. If the instruction is not 6588 // predicated, we fall through to the next case. 6589 if (VF.isVector() && isScalarWithPredication(I)) { 6590 unsigned Cost = 0; 6591 6592 // These instructions have a non-void type, so account for the phi nodes 6593 // that we will create. This cost is likely to be zero. The phi node 6594 // cost, if any, should be scaled by the block probability because it 6595 // models a copy at the end of each predicated block. 6596 Cost += VF.getKnownMinValue() * 6597 TTI.getCFInstrCost(Instruction::PHI, CostKind); 6598 6599 // The cost of the non-predicated instruction. 6600 Cost += VF.getKnownMinValue() * 6601 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6602 6603 // The cost of insertelement and extractelement instructions needed for 6604 // scalarization. 6605 Cost += getScalarizationOverhead(I, VF); 6606 6607 // Scale the cost by the probability of executing the predicated blocks. 6608 // This assumes the predicated block for each vector lane is equally 6609 // likely. 6610 return Cost / getReciprocalPredBlockProb(); 6611 } 6612 LLVM_FALLTHROUGH; 6613 case Instruction::Add: 6614 case Instruction::FAdd: 6615 case Instruction::Sub: 6616 case Instruction::FSub: 6617 case Instruction::Mul: 6618 case Instruction::FMul: 6619 case Instruction::FDiv: 6620 case Instruction::FRem: 6621 case Instruction::Shl: 6622 case Instruction::LShr: 6623 case Instruction::AShr: 6624 case Instruction::And: 6625 case Instruction::Or: 6626 case Instruction::Xor: { 6627 // Since we will replace the stride by 1 the multiplication should go away. 6628 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6629 return 0; 6630 // Certain instructions can be cheaper to vectorize if they have a constant 6631 // second vector operand. One example of this are shifts on x86. 6632 Value *Op2 = I->getOperand(1); 6633 TargetTransformInfo::OperandValueProperties Op2VP; 6634 TargetTransformInfo::OperandValueKind Op2VK = 6635 TTI.getOperandInfo(Op2, Op2VP); 6636 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6637 Op2VK = TargetTransformInfo::OK_UniformValue; 6638 6639 SmallVector<const Value *, 4> Operands(I->operand_values()); 6640 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 6641 return N * TTI.getArithmeticInstrCost( 6642 I->getOpcode(), VectorTy, CostKind, 6643 TargetTransformInfo::OK_AnyValue, 6644 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6645 } 6646 case Instruction::FNeg: { 6647 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 6648 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 6649 return N * TTI.getArithmeticInstrCost( 6650 I->getOpcode(), VectorTy, CostKind, 6651 TargetTransformInfo::OK_AnyValue, 6652 TargetTransformInfo::OK_AnyValue, 6653 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6654 I->getOperand(0), I); 6655 } 6656 case Instruction::Select: { 6657 SelectInst *SI = cast<SelectInst>(I); 6658 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6659 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6660 Type *CondTy = SI->getCondition()->getType(); 6661 if (!ScalarCond) { 6662 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 6663 CondTy = VectorType::get(CondTy, VF); 6664 } 6665 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 6666 CostKind, I); 6667 } 6668 case Instruction::ICmp: 6669 case Instruction::FCmp: { 6670 Type *ValTy = I->getOperand(0)->getType(); 6671 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6672 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6673 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6674 VectorTy = ToVectorTy(ValTy, VF); 6675 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind, 6676 I); 6677 } 6678 case Instruction::Store: 6679 case Instruction::Load: { 6680 ElementCount Width = VF; 6681 if (Width.isVector()) { 6682 InstWidening Decision = getWideningDecision(I, Width); 6683 assert(Decision != CM_Unknown && 6684 "CM decision should be taken at this point"); 6685 if (Decision == CM_Scalarize) 6686 Width = ElementCount::getFixed(1); 6687 } 6688 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6689 return getMemoryInstructionCost(I, VF); 6690 } 6691 case Instruction::ZExt: 6692 case Instruction::SExt: 6693 case Instruction::FPToUI: 6694 case Instruction::FPToSI: 6695 case Instruction::FPExt: 6696 case Instruction::PtrToInt: 6697 case Instruction::IntToPtr: 6698 case Instruction::SIToFP: 6699 case Instruction::UIToFP: 6700 case Instruction::Trunc: 6701 case Instruction::FPTrunc: 6702 case Instruction::BitCast: { 6703 // Computes the CastContextHint from a Load/Store instruction. 6704 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 6705 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 6706 "Expected a load or a store!"); 6707 6708 if (VF.isScalar() || !TheLoop->contains(I)) 6709 return TTI::CastContextHint::Normal; 6710 6711 switch (getWideningDecision(I, VF)) { 6712 case LoopVectorizationCostModel::CM_GatherScatter: 6713 return TTI::CastContextHint::GatherScatter; 6714 case LoopVectorizationCostModel::CM_Interleave: 6715 return TTI::CastContextHint::Interleave; 6716 case LoopVectorizationCostModel::CM_Scalarize: 6717 case LoopVectorizationCostModel::CM_Widen: 6718 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 6719 : TTI::CastContextHint::Normal; 6720 case LoopVectorizationCostModel::CM_Widen_Reverse: 6721 return TTI::CastContextHint::Reversed; 6722 case LoopVectorizationCostModel::CM_Unknown: 6723 llvm_unreachable("Instr did not go through cost modelling?"); 6724 } 6725 6726 llvm_unreachable("Unhandled case!"); 6727 }; 6728 6729 unsigned Opcode = I->getOpcode(); 6730 TTI::CastContextHint CCH = TTI::CastContextHint::None; 6731 // For Trunc, the context is the only user, which must be a StoreInst. 
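    // Illustrative shape: '%t = trunc i32 %x to i8' whose only user is a store of %t;
    // the store's widening decision then supplies the hint, e.g. CM_Widen maps to
    // CastContextHint::Normal (or Masked when a mask is required).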
6732 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 6733 if (I->hasOneUse()) 6734 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 6735 CCH = ComputeCCH(Store); 6736 } 6737 // For Z/Sext, the context is the operand, which must be a LoadInst. 6738 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 6739 Opcode == Instruction::FPExt) { 6740 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 6741 CCH = ComputeCCH(Load); 6742 } 6743 6744 // We optimize the truncation of induction variables having constant 6745 // integer steps. The cost of these truncations is the same as the scalar 6746 // operation. 6747 if (isOptimizableIVTruncate(I, VF)) { 6748 auto *Trunc = cast<TruncInst>(I); 6749 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6750 Trunc->getSrcTy(), CCH, CostKind, Trunc); 6751 } 6752 6753 Type *SrcScalarTy = I->getOperand(0)->getType(); 6754 Type *SrcVecTy = 6755 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6756 if (canTruncateToMinimalBitwidth(I, VF)) { 6757 // This cast is going to be shrunk. This may remove the cast or it might 6758 // turn it into slightly different cast. For example, if MinBW == 16, 6759 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6760 // 6761 // Calculate the modified src and dest types. 6762 Type *MinVecTy = VectorTy; 6763 if (Opcode == Instruction::Trunc) { 6764 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6765 VectorTy = 6766 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6767 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 6768 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6769 VectorTy = 6770 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6771 } 6772 } 6773 6774 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 6775 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 6776 return N * 6777 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 6778 } 6779 case Instruction::Call: { 6780 bool NeedToScalarize; 6781 CallInst *CI = cast<CallInst>(I); 6782 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6783 if (getVectorIntrinsicIDForCall(CI, TLI)) 6784 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6785 return CallCost; 6786 } 6787 default: 6788 // The cost of executing VF copies of the scalar instruction. This opcode 6789 // is unknown. Assume that it is the same as 'mul'. 6790 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 6791 Instruction::Mul, VectorTy, CostKind) + 6792 getScalarizationOverhead(I, VF); 6793 } // end of switch. 
6794 } 6795 6796 char LoopVectorize::ID = 0; 6797 6798 static const char lv_name[] = "Loop Vectorization"; 6799 6800 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6801 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6802 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6803 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6804 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6805 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6806 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6807 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6808 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6809 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6810 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6811 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6812 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6813 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6814 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6815 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6816 6817 namespace llvm { 6818 6819 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6820 6821 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6822 bool VectorizeOnlyWhenForced) { 6823 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6824 } 6825 6826 } // end namespace llvm 6827 6828 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6829 // Check if the pointer operand of a load or store instruction is 6830 // consecutive. 6831 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6832 return Legal->isConsecutivePtr(Ptr); 6833 return false; 6834 } 6835 6836 void LoopVectorizationCostModel::collectValuesToIgnore() { 6837 // Ignore ephemeral values. 6838 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6839 6840 // Ignore type-promoting instructions we identified during reduction 6841 // detection. 6842 for (auto &Reduction : Legal->getReductionVars()) { 6843 RecurrenceDescriptor &RedDes = Reduction.second; 6844 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6845 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6846 } 6847 // Ignore type-casting instructions we identified during induction 6848 // detection. 6849 for (auto &Induction : Legal->getInductionVars()) { 6850 InductionDescriptor &IndDes = Induction.second; 6851 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6852 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6853 } 6854 } 6855 6856 void LoopVectorizationCostModel::collectInLoopReductions() { 6857 // For the moment, without predicated reduction instructions, we do not 6858 // support inloop reductions whilst folding the tail, and hence in those cases 6859 // all reductions are currently out of the loop. 6860 if (!PreferInLoopReductions || foldTailByMasking()) 6861 return; 6862 6863 for (auto &Reduction : Legal->getReductionVars()) { 6864 PHINode *Phi = Reduction.first; 6865 RecurrenceDescriptor &RdxDesc = Reduction.second; 6866 6867 // We don't collect reductions that are type promoted (yet). 6868 if (RdxDesc.getRecurrenceType() != Phi->getType()) 6869 continue; 6870 6871 // Check that we can correctly put the reductions into the loop, by 6872 // finding the chain of operations that leads from the phi to the loop 6873 // exit value. 
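    // For instance, for a simple 'sum += a[i]' reduction the chain is the single add
    // feeding the loop-exit value, so ReductionOperations below is non-empty and the
    // reduction can be kept in-loop.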
6874 SmallVector<Instruction *, 4> ReductionOperations = 6875 RdxDesc.getReductionOpChain(Phi, TheLoop); 6876 bool InLoop = !ReductionOperations.empty(); 6877 if (InLoop) 6878 InLoopReductionChains[Phi] = ReductionOperations; 6879 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 6880 << " reduction for phi: " << *Phi << "\n"); 6881 } 6882 } 6883 6884 // TODO: we could return a pair of values that specify the max VF and 6885 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6886 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment 6887 // doesn't have a cost model that can choose which plan to execute if 6888 // more than one is generated. 6889 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6890 LoopVectorizationCostModel &CM) { 6891 unsigned WidestType; 6892 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6893 return WidestVectorRegBits / WidestType; 6894 } 6895 6896 VectorizationFactor 6897 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 6898 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 6899 ElementCount VF = UserVF; 6900 // Outer loop handling: outer loops may require CFG and instruction level 6901 // transformations before even evaluating whether vectorization is profitable. 6902 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6903 // the vectorization pipeline. 6904 if (!OrigLoop->empty()) { 6905 // If the user doesn't provide a vectorization factor, determine a 6906 // reasonable one. 6907 if (UserVF.isZero()) { 6908 VF = ElementCount::getFixed( 6909 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 6910 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6911 6912 // Make sure we have a VF > 1 for stress testing. 6913 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 6914 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6915 << "overriding computed VF.\n"); 6916 VF = ElementCount::getFixed(4); 6917 } 6918 } 6919 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6920 assert(isPowerOf2_32(VF.getKnownMinValue()) && 6921 "VF needs to be a power of two"); 6922 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 6923 << "VF " << VF << " to build VPlans.\n"); 6924 buildVPlans(VF.getKnownMinValue(), VF.getKnownMinValue()); 6925 6926 // For VPlan build stress testing, we bail out after VPlan construction. 6927 if (VPlanBuildStressTest) 6928 return VectorizationFactor::Disabled(); 6929 6930 return {VF, 0 /*Cost*/}; 6931 } 6932 6933 LLVM_DEBUG( 6934 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6935 "VPlan-native path.\n"); 6936 return VectorizationFactor::Disabled(); 6937 } 6938 6939 Optional<VectorizationFactor> 6940 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 6941 assert(!UserVF.isScalable() && "scalable vectorization not yet handled"); 6942 assert(OrigLoop->empty() && "Inner loop expected."); 6943 Optional<unsigned> MaybeMaxVF = 6944 CM.computeMaxVF(UserVF.getKnownMinValue(), UserIC); 6945 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved. 6946 return None; 6947 6948 // Invalidate interleave groups if all blocks of the loop will be predicated.
6949 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6950 !useMaskedInterleavedAccesses(*TTI)) { 6951 LLVM_DEBUG( 6952 dbgs() 6953 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6954 "which requires masked-interleaved support.\n"); 6955 if (CM.InterleaveInfo.invalidateGroups()) 6956 // Invalidating interleave groups also requires invalidating all decisions 6957 // based on them, which includes widening decisions and uniform and scalar 6958 // values. 6959 CM.invalidateCostModelingDecisions(); 6960 } 6961 6962 if (!UserVF.isZero()) { 6963 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6964 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 6965 "VF needs to be a power of two"); 6966 // Collect the instructions (and their associated costs) that will be more 6967 // profitable to scalarize. 6968 CM.selectUserVectorizationFactor(UserVF); 6969 CM.collectInLoopReductions(); 6970 buildVPlansWithVPRecipes(UserVF.getKnownMinValue(), 6971 UserVF.getKnownMinValue()); 6972 LLVM_DEBUG(printPlans(dbgs())); 6973 return {{UserVF, 0}}; 6974 } 6975 6976 unsigned MaxVF = MaybeMaxVF.getValue(); 6977 assert(MaxVF != 0 && "MaxVF is zero."); 6978 6979 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6980 // Collect Uniform and Scalar instructions after vectorization with VF. 6981 CM.collectUniformsAndScalars(ElementCount::getFixed(VF)); 6982 6983 // Collect the instructions (and their associated costs) that will be more 6984 // profitable to scalarize. 6985 if (VF > 1) 6986 CM.collectInstsToScalarize(ElementCount::getFixed(VF)); 6987 } 6988 6989 CM.collectInLoopReductions(); 6990 6991 buildVPlansWithVPRecipes(1, MaxVF); 6992 LLVM_DEBUG(printPlans(dbgs())); 6993 if (MaxVF == 1) 6994 return VectorizationFactor::Disabled(); 6995 6996 // Select the optimal vectorization factor. 6997 return CM.selectVectorizationFactor(MaxVF); 6998 } 6999 7000 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7001 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7002 << '\n'); 7003 BestVF = VF; 7004 BestUF = UF; 7005 7006 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7007 return !Plan->hasVF(VF); 7008 }); 7009 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7010 } 7011 7012 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7013 DominatorTree *DT) { 7014 // Perform the actual loop transformation. 7015 7016 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7017 VPCallbackILV CallbackILV(ILV); 7018 7019 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7020 7021 VPTransformState State{*BestVF, BestUF, LI, 7022 DT, ILV.Builder, ILV.VectorLoopValueMap, 7023 &ILV, CallbackILV}; 7024 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7025 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7026 State.CanonicalIV = ILV.Induction; 7027 7028 //===------------------------------------------------===// 7029 // 7030 // Notice: any optimization or new instruction that go 7031 // into the code below should also be implemented in 7032 // the cost-model. 7033 // 7034 //===------------------------------------------------===// 7035 7036 // 2. Copy and widen instructions from the old loop into the new loop. 7037 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7038 VPlans.front()->execute(&State); 7039 7040 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7041 // predication, updating analyses. 
7042 ILV.fixVectorizedLoop(); 7043 } 7044 7045 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7046 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7047 BasicBlock *Latch = OrigLoop->getLoopLatch(); 7048 7049 // We create new control-flow for the vectorized loop, so the original 7050 // condition will be dead after vectorization if it's only used by the 7051 // branch. 7052 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 7053 if (Cmp && Cmp->hasOneUse()) 7054 DeadInstructions.insert(Cmp); 7055 7056 // We create new "steps" for induction variable updates to which the original 7057 // induction variables map. An original update instruction will be dead if 7058 // all its users except the induction variable are dead. 7059 for (auto &Induction : Legal->getInductionVars()) { 7060 PHINode *Ind = Induction.first; 7061 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7062 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7063 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7064 })) 7065 DeadInstructions.insert(IndUpdate); 7066 7067 // We record as "Dead" also the type-casting instructions we had identified 7068 // during induction analysis. We don't need any handling for them in the 7069 // vectorized loop because we have proven that, under a proper runtime 7070 // test guarding the vectorized loop, the value of the phi, and the casted 7071 // value of the phi, are the same. The last instruction in this casting chain 7072 // will get its scalar/vector/widened def from the scalar/vector/widened def 7073 // of the respective phi node. Any other casts in the induction def-use chain 7074 // have no other uses outside the phi update chain, and will be ignored. 7075 InductionDescriptor &IndDes = Induction.second; 7076 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7077 DeadInstructions.insert(Casts.begin(), Casts.end()); 7078 } 7079 } 7080 7081 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7082 7083 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7084 7085 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7086 Instruction::BinaryOps BinOp) { 7087 // When unrolling and the VF is 1, we only need to add a simple scalar. 7088 Type *Ty = Val->getType(); 7089 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7090 7091 if (Ty->isFloatingPointTy()) { 7092 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7093 7094 // Floating point operations had to be 'fast' to enable the unrolling. 7095 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7096 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7097 } 7098 Constant *C = ConstantInt::get(Ty, StartIdx); 7099 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7100 } 7101 7102 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7103 SmallVector<Metadata *, 4> MDs; 7104 // Reserve first location for self reference to the LoopID metadata node. 7105 MDs.push_back(nullptr); 7106 bool IsUnrollMetadata = false; 7107 MDNode *LoopID = L->getLoopID(); 7108 if (LoopID) { 7109 // First find existing loop unrolling disable metadata. 
7110 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7111 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7112 if (MD) { 7113 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7114 IsUnrollMetadata = 7115 S && S->getString().startswith("llvm.loop.unroll.disable"); 7116 } 7117 MDs.push_back(LoopID->getOperand(i)); 7118 } 7119 } 7120 7121 if (!IsUnrollMetadata) { 7122 // Add runtime unroll disable metadata. 7123 LLVMContext &Context = L->getHeader()->getContext(); 7124 SmallVector<Metadata *, 1> DisableOperands; 7125 DisableOperands.push_back( 7126 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7127 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7128 MDs.push_back(DisableNode); 7129 MDNode *NewLoopID = MDNode::get(Context, MDs); 7130 // Set operand 0 to refer to the loop id itself. 7131 NewLoopID->replaceOperandWith(0, NewLoopID); 7132 L->setLoopID(NewLoopID); 7133 } 7134 } 7135 7136 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7137 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7138 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 7139 bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start)); 7140 7141 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 7142 if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) { 7143 Range.End = TmpVF; 7144 break; 7145 } 7146 7147 return PredicateAtRangeStart; 7148 } 7149 7150 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7151 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7152 /// of VF's starting at a given VF and extending it as much as possible. Each 7153 /// vectorization decision can potentially shorten this sub-range during 7154 /// buildVPlan(). 7155 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 7156 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7157 VFRange SubRange = {VF, MaxVF + 1}; 7158 VPlans.push_back(buildVPlan(SubRange)); 7159 VF = SubRange.End; 7160 } 7161 } 7162 7163 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7164 VPlanPtr &Plan) { 7165 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7166 7167 // Look for cached value. 7168 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7169 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7170 if (ECEntryIt != EdgeMaskCache.end()) 7171 return ECEntryIt->second; 7172 7173 VPValue *SrcMask = createBlockInMask(Src, Plan); 7174 7175 // The terminator has to be a branch inst! 7176 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7177 assert(BI && "Unexpected terminator found"); 7178 7179 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7180 return EdgeMaskCache[Edge] = SrcMask; 7181 7182 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 7183 assert(EdgeMask && "No Edge Mask found for condition"); 7184 7185 if (BI->getSuccessor(0) != Dst) 7186 EdgeMask = Builder.createNot(EdgeMask); 7187 7188 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 7189 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 7190 7191 return EdgeMaskCache[Edge] = EdgeMask; 7192 } 7193 7194 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7195 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7196 7197 // Look for cached value. 
7198 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7199 if (BCEntryIt != BlockMaskCache.end()) 7200 return BCEntryIt->second; 7201 7202 // All-one mask is modelled as no-mask following the convention for masked 7203 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7204 VPValue *BlockMask = nullptr; 7205 7206 if (OrigLoop->getHeader() == BB) { 7207 if (!CM.blockNeedsPredication(BB)) 7208 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 7209 7210 // Introduce the early-exit compare IV <= BTC to form header block mask. 7211 // This is used instead of IV < TC because TC may wrap, unlike BTC. 7212 // Start by constructing the desired canonical IV. 7213 VPValue *IV = nullptr; 7214 if (Legal->getPrimaryInduction()) 7215 IV = Plan->getVPValue(Legal->getPrimaryInduction()); 7216 else { 7217 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 7218 Builder.getInsertBlock()->appendRecipe(IVRecipe); 7219 IV = IVRecipe->getVPValue(); 7220 } 7221 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 7222 bool TailFolded = !CM.isScalarEpilogueAllowed(); 7223 7224 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 7225 // While ActiveLaneMask is a binary op that consumes the loop tripcount 7226 // as a second argument, we only pass the IV here and extract the 7227 // tripcount from the transform state where codegen of the VP instructions 7228 // happen. 7229 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 7230 } else { 7231 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 7232 } 7233 return BlockMaskCache[BB] = BlockMask; 7234 } 7235 7236 // This is the block mask. We OR all incoming edges. 7237 for (auto *Predecessor : predecessors(BB)) { 7238 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 7239 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 7240 return BlockMaskCache[BB] = EdgeMask; 7241 7242 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
7243 BlockMask = EdgeMask; 7244 continue; 7245 } 7246 7247 BlockMask = Builder.createOr(BlockMask, EdgeMask); 7248 } 7249 7250 return BlockMaskCache[BB] = BlockMask; 7251 } 7252 7253 VPWidenMemoryInstructionRecipe * 7254 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 7255 VPlanPtr &Plan) { 7256 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7257 "Must be called with either a load or store"); 7258 7259 auto willWiden = [&](ElementCount VF) -> bool { 7260 assert(!VF.isScalable() && "unexpected scalable ElementCount"); 7261 if (VF.isScalar()) 7262 return false; 7263 LoopVectorizationCostModel::InstWidening Decision = 7264 CM.getWideningDecision(I, VF); 7265 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 7266 "CM decision should be taken at this point."); 7267 if (Decision == LoopVectorizationCostModel::CM_Interleave) 7268 return true; 7269 if (CM.isScalarAfterVectorization(I, VF) || 7270 CM.isProfitableToScalarize(I, VF)) 7271 return false; 7272 return Decision != LoopVectorizationCostModel::CM_Scalarize; 7273 }; 7274 7275 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7276 return nullptr; 7277 7278 VPValue *Mask = nullptr; 7279 if (Legal->isMaskRequired(I)) 7280 Mask = createBlockInMask(I->getParent(), Plan); 7281 7282 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 7283 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 7284 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 7285 7286 StoreInst *Store = cast<StoreInst>(I); 7287 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 7288 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 7289 } 7290 7291 VPWidenIntOrFpInductionRecipe * 7292 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 7293 // Check if this is an integer or fp induction. If so, build the recipe that 7294 // produces its scalar and vector values. 7295 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 7296 if (II.getKind() == InductionDescriptor::IK_IntInduction || 7297 II.getKind() == InductionDescriptor::IK_FpInduction) 7298 return new VPWidenIntOrFpInductionRecipe(Phi); 7299 7300 return nullptr; 7301 } 7302 7303 VPWidenIntOrFpInductionRecipe * 7304 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 7305 VFRange &Range) const { 7306 // Optimize the special case where the source is a constant integer 7307 // induction variable. Notice that we can only optimize the 'trunc' case 7308 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 7309 // (c) other casts depend on pointer size. 7310 7311 // Determine whether \p K is a truncation based on an induction variable that 7312 // can be optimized. 7313 auto isOptimizableIVTruncate = 7314 [&](Instruction *K) -> std::function<bool(ElementCount)> { 7315 return [=](ElementCount VF) -> bool { 7316 return CM.isOptimizableIVTruncate(K, VF); 7317 }; 7318 }; 7319 7320 if (LoopVectorizationPlanner::getDecisionAndClampRange( 7321 isOptimizableIVTruncate(I), Range)) 7322 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 7323 I); 7324 return nullptr; 7325 } 7326 7327 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 7328 // We know that all PHIs in non-header blocks are converted into selects, so 7329 // we don't have to worry about the insertion order and we can just use the 7330 // builder. At this point we generate the predication tree. 
There may be 7331 // duplications since this is a simple recursive scan, but future 7332 // optimizations will clean it up. 7333 7334 SmallVector<VPValue *, 2> Operands; 7335 unsigned NumIncoming = Phi->getNumIncomingValues(); 7336 for (unsigned In = 0; In < NumIncoming; In++) { 7337 VPValue *EdgeMask = 7338 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 7339 assert((EdgeMask || NumIncoming == 1) && 7340 "Multiple predecessors with one having a full mask"); 7341 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 7342 if (EdgeMask) 7343 Operands.push_back(EdgeMask); 7344 } 7345 return new VPBlendRecipe(Phi, Operands); 7346 } 7347 7348 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 7349 VPlan &Plan) const { 7350 7351 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7352 [this, CI](ElementCount VF) { 7353 return CM.isScalarWithPredication(CI, VF); 7354 }, 7355 Range); 7356 7357 if (IsPredicated) 7358 return nullptr; 7359 7360 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7361 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 7362 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 7363 return nullptr; 7364 7365 auto willWiden = [&](ElementCount VF) -> bool { 7366 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 7367 // The following case may be scalarized depending on the VF. 7368 // The flag shows whether we use Intrinsic or a usual Call for vectorized 7369 // version of the instruction. 7370 // Is it beneficial to perform intrinsic call compared to lib call? 7371 bool NeedToScalarize = false; 7372 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 7373 bool UseVectorIntrinsic = 7374 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 7375 return UseVectorIntrinsic || !NeedToScalarize; 7376 }; 7377 7378 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 7379 return nullptr; 7380 7381 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 7382 } 7383 7384 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 7385 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 7386 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 7387 // Instruction should be widened, unless it is scalar after vectorization, 7388 // scalarization is profitable or it is predicated. 
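  // Restating the range semantics (informal): getDecisionAndClampRange
  // evaluates the predicate at Range.Start and shrinks Range.End down to the
  // first VF at which the answer changes, so every VF remaining in Range
  // shares a single widen-vs-scalarize decision.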
7389 auto WillScalarize = [this, I](ElementCount VF) -> bool { 7390 return CM.isScalarAfterVectorization(I, VF) || 7391 CM.isProfitableToScalarize(I, VF) || 7392 CM.isScalarWithPredication(I, VF); 7393 }; 7394 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 7395 Range); 7396 } 7397 7398 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 7399 auto IsVectorizableOpcode = [](unsigned Opcode) { 7400 switch (Opcode) { 7401 case Instruction::Add: 7402 case Instruction::And: 7403 case Instruction::AShr: 7404 case Instruction::BitCast: 7405 case Instruction::FAdd: 7406 case Instruction::FCmp: 7407 case Instruction::FDiv: 7408 case Instruction::FMul: 7409 case Instruction::FNeg: 7410 case Instruction::FPExt: 7411 case Instruction::FPToSI: 7412 case Instruction::FPToUI: 7413 case Instruction::FPTrunc: 7414 case Instruction::FRem: 7415 case Instruction::FSub: 7416 case Instruction::ICmp: 7417 case Instruction::IntToPtr: 7418 case Instruction::LShr: 7419 case Instruction::Mul: 7420 case Instruction::Or: 7421 case Instruction::PtrToInt: 7422 case Instruction::SDiv: 7423 case Instruction::Select: 7424 case Instruction::SExt: 7425 case Instruction::Shl: 7426 case Instruction::SIToFP: 7427 case Instruction::SRem: 7428 case Instruction::Sub: 7429 case Instruction::Trunc: 7430 case Instruction::UDiv: 7431 case Instruction::UIToFP: 7432 case Instruction::URem: 7433 case Instruction::Xor: 7434 case Instruction::ZExt: 7435 return true; 7436 } 7437 return false; 7438 }; 7439 7440 if (!IsVectorizableOpcode(I->getOpcode())) 7441 return nullptr; 7442 7443 // Success: widen this instruction. 7444 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 7445 } 7446 7447 VPBasicBlock *VPRecipeBuilder::handleReplication( 7448 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7449 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7450 VPlanPtr &Plan) { 7451 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7452 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 7453 Range); 7454 7455 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7456 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 7457 Range); 7458 7459 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 7460 IsUniform, IsPredicated); 7461 setRecipe(I, Recipe); 7462 7463 // Find if I uses a predicated instruction. If so, it will use its scalar 7464 // value. Avoid hoisting the insert-element which packs the scalar value into 7465 // a vector value, as that happens iff all users use the vector value. 7466 for (auto &Op : I->operands()) 7467 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7468 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7469 PredInst2Recipe[PredInst]->setAlsoPack(false); 7470 7471 // Finalize the recipe for Instr, first if it is not predicated. 7472 if (!IsPredicated) { 7473 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7474 VPBB->appendRecipe(Recipe); 7475 return VPBB; 7476 } 7477 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7478 assert(VPBB->getSuccessors().empty() && 7479 "VPBB has successors when handling predicated replication."); 7480 // Record predicated instructions for above packing optimizations. 
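  // Rough sketch of the single-trip region created below by
  // createReplicateRegion; the block-name suffixes match the ones used there:
  //
  //   pred.<opcode>.entry      BranchOnMask(BlockInMask)
  //        |       \
  //        |    pred.<opcode>.if        replicated, predicated instruction
  //        |       /
  //   pred.<opcode>.continue   optional PHI merging the predicated value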
7481 PredInst2Recipe[I] = Recipe; 7482 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7483 VPBlockUtils::insertBlockAfter(Region, VPBB); 7484 auto *RegSucc = new VPBasicBlock(); 7485 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7486 return RegSucc; 7487 } 7488 7489 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7490 VPRecipeBase *PredRecipe, 7491 VPlanPtr &Plan) { 7492 // Instructions marked for predication are replicated and placed under an 7493 // if-then construct to prevent side-effects. 7494 7495 // Generate recipes to compute the block mask for this region. 7496 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7497 7498 // Build the triangular if-then region. 7499 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7500 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7501 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7502 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7503 auto *PHIRecipe = 7504 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7505 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7506 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7507 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7508 7509 // Note: first set Entry as region entry and then connect successors starting 7510 // from it in order, to propagate the "parent" of each VPBasicBlock. 7511 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7512 VPBlockUtils::connectBlocks(Pred, Exit); 7513 7514 return Region; 7515 } 7516 7517 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 7518 VFRange &Range, 7519 VPlanPtr &Plan) { 7520 // First, check for specific widening recipes that deal with calls, memory 7521 // operations, inductions and Phi nodes. 7522 if (auto *CI = dyn_cast<CallInst>(Instr)) 7523 return tryToWidenCall(CI, Range, *Plan); 7524 7525 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 7526 return tryToWidenMemory(Instr, Range, Plan); 7527 7528 VPRecipeBase *Recipe; 7529 if (auto Phi = dyn_cast<PHINode>(Instr)) { 7530 if (Phi->getParent() != OrigLoop->getHeader()) 7531 return tryToBlend(Phi, Plan); 7532 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 7533 return Recipe; 7534 return new VPWidenPHIRecipe(Phi); 7535 } 7536 7537 if (isa<TruncInst>(Instr) && 7538 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 7539 return Recipe; 7540 7541 if (!shouldWiden(Instr, Range)) 7542 return nullptr; 7543 7544 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 7545 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 7546 OrigLoop); 7547 7548 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 7549 bool InvariantCond = 7550 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 7551 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 7552 InvariantCond); 7553 } 7554 7555 return tryToWiden(Instr, *Plan); 7556 } 7557 7558 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7559 unsigned MaxVF) { 7560 assert(OrigLoop->empty() && "Inner loop expected."); 7561 7562 // Collect conditions feeding internal conditional branches; they need to be 7563 // represented in VPlan for it to model masking. 
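  // For example (illustrative), given a body containing "if (a[i] > c)", the
  // i1 compare feeding that conditional branch is recorded here so the recipe
  // builder can later form edge and block masks from its VPValue.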
7564 SmallPtrSet<Value *, 1> NeedDef; 7565 7566 auto *Latch = OrigLoop->getLoopLatch(); 7567 for (BasicBlock *BB : OrigLoop->blocks()) { 7568 if (BB == Latch) 7569 continue; 7570 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 7571 if (Branch && Branch->isConditional()) 7572 NeedDef.insert(Branch->getCondition()); 7573 } 7574 7575 // If the tail is to be folded by masking, the primary induction variable, 7576 // if it exists, needs to be represented in VPlan to model early-exit masking. 7577 // Also, both the Phi and the live-out instruction of each reduction are 7578 // required in order to introduce a select between them in VPlan. 7579 if (CM.foldTailByMasking()) { 7580 if (Legal->getPrimaryInduction()) 7581 NeedDef.insert(Legal->getPrimaryInduction()); 7582 for (auto &Reduction : Legal->getReductionVars()) { 7583 NeedDef.insert(Reduction.first); 7584 NeedDef.insert(Reduction.second.getLoopExitInstr()); 7585 } 7586 } 7587 7588 // Collect instructions from the original loop that will become trivially dead 7589 // in the vectorized loop. We don't need to vectorize these instructions. For 7590 // example, original induction update instructions can become dead because we 7591 // separately emit induction "steps" when generating code for the new loop. 7592 // Similarly, we create a new latch condition when setting up the structure 7593 // of the new loop, so the old one can become dead. 7594 SmallPtrSet<Instruction *, 4> DeadInstructions; 7595 collectTriviallyDeadInstructions(DeadInstructions); 7596 7597 // Add assume instructions we need to drop to DeadInstructions, to prevent 7598 // them from being added to the VPlan. 7599 // TODO: We only need to drop assumes in blocks that get flattened. If the 7600 // control flow is preserved, we should keep them. 7601 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 7602 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 7603 7604 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 7605 // Dead instructions do not need sinking. Remove them from SinkAfter. 7606 for (Instruction *I : DeadInstructions) 7607 SinkAfter.erase(I); 7608 7609 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7610 VFRange SubRange = {VF, MaxVF + 1}; 7611 VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef, 7612 DeadInstructions, SinkAfter)); 7613 VF = SubRange.End; 7614 } 7615 } 7616 7617 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 7618 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 7619 SmallPtrSetImpl<Instruction *> &DeadInstructions, 7620 const DenseMap<Instruction *, Instruction *> &SinkAfter) { 7621 7622 // Hold a mapping from predicated instructions to their recipes, in order to 7623 // fix their AlsoPack behavior if a user is determined to replicate and use a 7624 // scalar instead of a vector value. 7625 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 7626 7627 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 7628 7629 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 7630 7631 // --------------------------------------------------------------------------- 7632 // Pre-construction: record ingredients whose recipes we'll need to further 7633 // process after constructing the initial VPlan. 7634 // --------------------------------------------------------------------------- 7635 7636 // Mark instructions we'll need to sink later and their targets as 7637 // ingredients whose recipe we'll need to record.
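  // Informally, each (first, second) pair in SinkAfter asks that the recipe of
  // 'first' be moved to just after the recipe of 'second' (see the moveAfter
  // call further down), which is why both sides need their recipes recorded.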
7638 for (auto &Entry : SinkAfter) { 7639 RecipeBuilder.recordRecipeOf(Entry.first); 7640 RecipeBuilder.recordRecipeOf(Entry.second); 7641 } 7642 for (auto &Reduction : CM.getInLoopReductionChains()) { 7643 PHINode *Phi = Reduction.first; 7644 RecurrenceDescriptor::RecurrenceKind Kind = 7645 Legal->getReductionVars()[Phi].getRecurrenceKind(); 7646 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 7647 7648 RecipeBuilder.recordRecipeOf(Phi); 7649 for (auto &R : ReductionOperations) { 7650 RecipeBuilder.recordRecipeOf(R); 7651 // For min/max reductions, where we have a pair of icmp/select, we also 7652 // need to record the ICmp recipe, so it can be removed later. 7653 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7654 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7655 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 7656 } 7657 } 7658 } 7659 7660 // For each interleave group which is relevant for this (possibly trimmed) 7661 // Range, add it to the set of groups to be later applied to the VPlan and add 7662 // placeholders for its members' Recipes which we'll be replacing with a 7663 // single VPInterleaveRecipe. 7664 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 7665 auto applyIG = [IG, this](ElementCount VF) -> bool { 7666 return (VF.isVector() && // Query is illegal for VF == 1 7667 CM.getWideningDecision(IG->getInsertPos(), VF) == 7668 LoopVectorizationCostModel::CM_Interleave); 7669 }; 7670 if (!getDecisionAndClampRange(applyIG, Range)) 7671 continue; 7672 InterleaveGroups.insert(IG); 7673 for (unsigned i = 0; i < IG->getFactor(); i++) 7674 if (Instruction *Member = IG->getMember(i)) 7675 RecipeBuilder.recordRecipeOf(Member); 7676 } 7677 7678 // --------------------------------------------------------------------------- 7679 // Build initial VPlan: Scan the body of the loop in a topological order to 7680 // visit each basic block after having visited its predecessor basic blocks. 7681 // --------------------------------------------------------------------------- 7682 7683 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 7684 auto Plan = std::make_unique<VPlan>(); 7685 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 7686 Plan->setEntry(VPBB); 7687 7688 // Represent values that will have defs inside VPlan. 7689 for (Value *V : NeedDef) 7690 Plan->addVPValue(V); 7691 7692 // Scan the body of the loop in a topological order to visit each basic block 7693 // after having visited its predecessor basic blocks. 7694 LoopBlocksDFS DFS(OrigLoop); 7695 DFS.perform(LI); 7696 7697 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 7698 // Relevant instructions from basic block BB will be grouped into VPRecipe 7699 // ingredients and will fill a new VPBasicBlock. 7700 unsigned VPBBsForBB = 0; 7701 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 7702 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 7703 VPBB = FirstVPBBForBB; 7704 Builder.setInsertPoint(VPBB); 7705 7706 // Introduce each ingredient into VPlan. 7707 // TODO: Model and preserve debug intrinsics in VPlan. 7708 for (Instruction &I : BB->instructionsWithoutDebug()) { 7709 Instruction *Instr = &I; 7710 7711 // First filter out irrelevant instructions, to ensure no recipes are 7712 // built for them.
7713 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 7714 continue; 7715 7716 if (auto Recipe = 7717 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 7718 RecipeBuilder.setRecipe(Instr, Recipe); 7719 VPBB->appendRecipe(Recipe); 7720 continue; 7721 } 7722 7723 // Otherwise, if all widening options failed, Instruction is to be 7724 // replicated. This may create a successor for VPBB. 7725 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7726 Instr, Range, VPBB, PredInst2Recipe, Plan); 7727 if (NextVPBB != VPBB) { 7728 VPBB = NextVPBB; 7729 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7730 : ""); 7731 } 7732 } 7733 } 7734 7735 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7736 // may also be empty, such as the last one VPBB, reflecting original 7737 // basic-blocks with no recipes. 7738 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7739 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7740 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7741 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7742 delete PreEntry; 7743 7744 // --------------------------------------------------------------------------- 7745 // Transform initial VPlan: Apply previously taken decisions, in order, to 7746 // bring the VPlan to its final state. 7747 // --------------------------------------------------------------------------- 7748 7749 // Apply Sink-After legal constraints. 7750 for (auto &Entry : SinkAfter) { 7751 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7752 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7753 Sink->moveAfter(Target); 7754 } 7755 7756 // Interleave memory: for each Interleave Group we marked earlier as relevant 7757 // for this VPlan, replace the Recipes widening its memory instructions with a 7758 // single VPInterleaveRecipe at its insertion point. 7759 for (auto IG : InterleaveGroups) { 7760 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7761 RecipeBuilder.getRecipe(IG->getInsertPos())); 7762 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7763 ->insertBefore(Recipe); 7764 7765 for (unsigned i = 0; i < IG->getFactor(); ++i) 7766 if (Instruction *Member = IG->getMember(i)) { 7767 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7768 } 7769 } 7770 7771 // Adjust the recipes for any inloop reductions. 7772 if (Range.Start > 1) 7773 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 7774 7775 // Finally, if tail is folded by masking, introduce selects between the phi 7776 // and the live-out instruction of each reduction, at the end of the latch. 
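  // Each select introduced below has the form (informal sketch):
  //   select(header block-in mask, loop-exit value, reduction phi)
  // so that lanes masked off by tail folding keep the phi's value and do not
  // perturb the final reduction.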
7777 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 7778 Builder.setInsertPoint(VPBB); 7779 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7780 for (auto &Reduction : Legal->getReductionVars()) { 7781 assert(!CM.isInLoopReduction(Reduction.first) && 7782 "Didn't expect inloop tail folded reduction yet!"); 7783 VPValue *Phi = Plan->getVPValue(Reduction.first); 7784 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7785 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7786 } 7787 } 7788 7789 std::string PlanName; 7790 raw_string_ostream RSO(PlanName); 7791 ElementCount VF = ElementCount::getFixed(Range.Start); 7792 Plan->addVF(VF); 7793 RSO << "Initial VPlan for VF={" << VF; 7794 for (VF *= 2; VF.getKnownMinValue() < Range.End; VF *= 2) { 7795 Plan->addVF(VF); 7796 RSO << "," << VF; 7797 } 7798 RSO << "},UF>=1"; 7799 RSO.flush(); 7800 Plan->setName(PlanName); 7801 7802 return Plan; 7803 } 7804 7805 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7806 // Outer loop handling: outer loops may require CFG and instruction level 7807 // transformations before even evaluating whether vectorization is profitable. 7808 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7809 // the vectorization pipeline. 7810 assert(!OrigLoop->empty()); 7811 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7812 7813 // Create a new empty VPlan. 7814 auto Plan = std::make_unique<VPlan>(); 7815 7816 // Build the hierarchical CFG. 7817 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7818 HCFGBuilder.buildHierarchicalCFG(); 7819 7820 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7821 Plan->addVF(ElementCount::getFixed(VF)); 7822 7823 if (EnableVPlanPredication) { 7824 VPlanPredicator VPP(*Plan); 7825 VPP.predicate(); 7826 7827 // Avoid running the transformation to recipes until masked code generation 7828 // in the VPlan-native path is in place. 7829 return Plan; 7830 } 7831 7832 SmallPtrSet<Instruction *, 1> DeadInstructions; 7833 VPlanTransforms::VPInstructionsToVPRecipes( 7834 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7835 return Plan; 7836 } 7837 7838 // Adjust the recipes for any inloop reductions. The chain of instructions 7839 // leading from the loop exit instr to the phi needs to be converted to 7840 // reductions, with one operand being vector and the other being the scalar 7841 // reduction chain. 7842 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 7843 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { 7844 for (auto &Reduction : CM.getInLoopReductionChains()) { 7845 PHINode *Phi = Reduction.first; 7846 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 7847 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 7848 7849 // ReductionOperations are ordered top-down from the phi's use to the 7850 // LoopExitValue. We keep track of the previous item (the Chain) to tell 7851 // which of the two operands will remain scalar and which will be reduced. 7852 // For min/max reductions, the chain will be the select instructions.
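    // Illustrative example (hypothetical IR): for a "sum += a[i]" reduction
    // the chain is {%add} with %add = add %phi, %load; the loop below replaces
    // the widened %add recipe with a VPReductionRecipe whose ChainOp is %phi's
    // value and whose VecOp is the widened %load.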
7853 Instruction *Chain = Phi; 7854 for (Instruction *R : ReductionOperations) { 7855 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 7856 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind(); 7857 7858 VPValue *ChainOp = Plan->getVPValue(Chain); 7859 unsigned FirstOpId; 7860 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7861 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7862 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC && 7863 "Expected to replace a VPWidenSelectSC"); 7864 FirstOpId = 1; 7865 } else { 7866 assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC && 7867 "Expected to replace a VPWidenSC"); 7868 FirstOpId = 0; 7869 } 7870 unsigned VecOpId = 7871 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 7872 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 7873 7874 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 7875 &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI); 7876 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 7877 WidenRecipe->eraseFromParent(); 7878 7879 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7880 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7881 VPRecipeBase *CompareRecipe = 7882 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 7883 assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC && 7884 "Expected to replace a VPWidenSC"); 7885 CompareRecipe->eraseFromParent(); 7886 } 7887 Chain = R; 7888 } 7889 } 7890 } 7891 7892 Value* LoopVectorizationPlanner::VPCallbackILV:: 7893 getOrCreateVectorValues(Value *V, unsigned Part) { 7894 return ILV.getOrCreateVectorValue(V, Part); 7895 } 7896 7897 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7898 Value *V, const VPIteration &Instance) { 7899 return ILV.getOrCreateScalarValue(V, Instance); 7900 } 7901 7902 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7903 VPSlotTracker &SlotTracker) const { 7904 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7905 IG->getInsertPos()->printAsOperand(O, false); 7906 O << ", "; 7907 getAddr()->printAsOperand(O, SlotTracker); 7908 VPValue *Mask = getMask(); 7909 if (Mask) { 7910 O << ", "; 7911 Mask->printAsOperand(O, SlotTracker); 7912 } 7913 for (unsigned i = 0; i < IG->getFactor(); ++i) 7914 if (Instruction *I = IG->getMember(i)) 7915 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 7916 } 7917 7918 void VPWidenCallRecipe::execute(VPTransformState &State) { 7919 State.ILV->widenCallInstruction(Ingredient, User, State); 7920 } 7921 7922 void VPWidenSelectRecipe::execute(VPTransformState &State) { 7923 State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State); 7924 } 7925 7926 void VPWidenRecipe::execute(VPTransformState &State) { 7927 State.ILV->widenInstruction(Ingredient, User, State); 7928 } 7929 7930 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7931 State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant, 7932 IsIndexLoopInvariant, State); 7933 } 7934 7935 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7936 assert(!State.Instance && "Int or FP induction being replicated."); 7937 State.ILV->widenIntOrFpInduction(IV, Trunc); 7938 } 7939 7940 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7941 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7942 } 7943 7944 void VPBlendRecipe::execute(VPTransformState &State) { 7945 
State.ILV->setDebugLocFromInst(State.Builder, Phi); 7946 // We know that all PHIs in non-header blocks are converted into 7947 // selects, so we don't have to worry about the insertion order and we 7948 // can just use the builder. 7949 // At this point we generate the predication tree. There may be 7950 // duplications since this is a simple recursive scan, but future 7951 // optimizations will clean it up. 7952 7953 unsigned NumIncoming = getNumIncomingValues(); 7954 7955 // Generate a sequence of selects of the form: 7956 // SELECT(Mask3, In3, 7957 // SELECT(Mask2, In2, 7958 // SELECT(Mask1, In1, 7959 // In0))) 7960 // Note that Mask0 is never used: lanes for which no path reaches this phi and 7961 // are essentially undef are taken from In0. 7962 InnerLoopVectorizer::VectorParts Entry(State.UF); 7963 for (unsigned In = 0; In < NumIncoming; ++In) { 7964 for (unsigned Part = 0; Part < State.UF; ++Part) { 7965 // We might have single edge PHIs (blocks) - use an identity 7966 // 'select' for the first PHI operand. 7967 Value *In0 = State.get(getIncomingValue(In), Part); 7968 if (In == 0) 7969 Entry[Part] = In0; // Initialize with the first incoming value. 7970 else { 7971 // Select between the current value and the previous incoming edge 7972 // based on the incoming mask. 7973 Value *Cond = State.get(getMask(In), Part); 7974 Entry[Part] = 7975 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 7976 } 7977 } 7978 } 7979 for (unsigned Part = 0; Part < State.UF; ++Part) 7980 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 7981 } 7982 7983 void VPInterleaveRecipe::execute(VPTransformState &State) { 7984 assert(!State.Instance && "Interleave group being replicated."); 7985 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask()); 7986 } 7987 7988 void VPReductionRecipe::execute(VPTransformState &State) { 7989 assert(!State.Instance && "Reduction being replicated."); 7990 for (unsigned Part = 0; Part < State.UF; ++Part) { 7991 unsigned Kind = RdxDesc->getRecurrenceKind(); 7992 Value *NewVecOp = State.get(VecOp, Part); 7993 Value *NewRed = 7994 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN); 7995 Value *PrevInChain = State.get(ChainOp, Part); 7996 Value *NextInChain; 7997 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 7998 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 7999 NextInChain = 8000 createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(), 8001 NewRed, PrevInChain); 8002 } else { 8003 NextInChain = State.Builder.CreateBinOp( 8004 (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain); 8005 } 8006 State.ValueMap.setVectorValue(I, Part, NextInChain); 8007 } 8008 } 8009 8010 void VPReplicateRecipe::execute(VPTransformState &State) { 8011 if (State.Instance) { // Generate a single instance. 8012 State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance, 8013 IsPredicated, State); 8014 // Insert scalar instance packing it into a vector. 8015 if (AlsoPack && State.VF.isVector()) { 8016 // If we're constructing lane 0, initialize to start from undef. 
8017 if (State.Instance->Lane == 0) { 8018 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 8019 Value *Undef = 8020 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); 8021 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); 8022 } 8023 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); 8024 } 8025 return; 8026 } 8027 8028 // Generate scalar instances for all VF lanes of all UF parts, unless the 8029 // instruction is uniform, in which case generate only the first lane for each 8030 // of the UF parts. 8031 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); 8032 for (unsigned Part = 0; Part < State.UF; ++Part) 8033 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 8034 State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane}, 8035 IsPredicated, State); 8036 } 8037 8038 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 8039 assert(State.Instance && "Branch on Mask works only on single instance."); 8040 8041 unsigned Part = State.Instance->Part; 8042 unsigned Lane = State.Instance->Lane; 8043 8044 Value *ConditionBit = nullptr; 8045 VPValue *BlockInMask = getMask(); 8046 if (BlockInMask) { 8047 ConditionBit = State.get(BlockInMask, Part); 8048 if (ConditionBit->getType()->isVectorTy()) 8049 ConditionBit = State.Builder.CreateExtractElement( 8050 ConditionBit, State.Builder.getInt32(Lane)); 8051 } else // Block in mask is all-one. 8052 ConditionBit = State.Builder.getTrue(); 8053 8054 // Replace the temporary unreachable terminator with a new conditional branch, 8055 // whose two destinations will be set later when they are created. 8056 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 8057 assert(isa<UnreachableInst>(CurrentTerminator) && 8058 "Expected to replace unreachable terminator with conditional branch."); 8059 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 8060 CondBr->setSuccessor(0, nullptr); 8061 ReplaceInstWithInst(CurrentTerminator, CondBr); 8062 } 8063 8064 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 8065 assert(State.Instance && "Predicated instruction PHI works per instance."); 8066 Instruction *ScalarPredInst = cast<Instruction>( 8067 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 8068 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 8069 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 8070 assert(PredicatingBB && "Predicated block has no single predecessor."); 8071 8072 // By current pack/unpack logic we need to generate only a single phi node: if 8073 // a vector value for the predicated instruction exists at this point it means 8074 // the instruction has vector users only, and a phi for the vector value is 8075 // needed. In this case the recipe of the predicated instruction is marked to 8076 // also do that packing, thereby "hoisting" the insert-element sequence. 8077 // Otherwise, a phi node for the scalar value is needed. 8078 unsigned Part = State.Instance->Part; 8079 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 8080 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 8081 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 8082 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 8083 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 8084 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 8085 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8086 } else { 8087 Type *PredInstType = PredInst->getType(); 8088 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 8089 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 8090 Phi->addIncoming(ScalarPredInst, PredicatedBB); 8091 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 8092 } 8093 } 8094 8095 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 8096 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 8097 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, 8098 getMask()); 8099 } 8100 8101 // Determine how to lower the scalar epilogue, which depends on 1) optimising 8102 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 8103 // predication, and 4) a TTI hook that analyses whether the loop is suitable 8104 // for predication. 8105 static ScalarEpilogueLowering getScalarEpilogueLowering( 8106 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 8107 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 8108 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 8109 LoopVectorizationLegality &LVL) { 8110 // 1) OptSize takes precedence over all other options, i.e. if this is set, 8111 // don't look at hints or options, and don't request a scalar epilogue. 8112 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 8113 // LoopAccessInfo (due to code dependency and not being able to reliably get 8114 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 8115 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 8116 // versioning when the vectorization is forced, unlike hasOptSize. So revert 8117 // back to the old way and vectorize with versioning when forced. See D81345.) 8118 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 8119 PGSOQueryType::IRPass) && 8120 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 8121 return CM_ScalarEpilogueNotAllowedOptSize; 8122 8123 bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() && 8124 !PreferPredicateOverEpilogue; 8125 8126 // 2) Next, if disabling predication is requested on the command line, honour 8127 // this and request a scalar epilogue. 8128 if (PredicateOptDisabled) 8129 return CM_ScalarEpilogueAllowed; 8130 8131 // 3) and 4) look if enabling predication is requested on the command line, 8132 // with a loop hint, or if the TTI hook indicates this is profitable, request 8133 // predication. 8134 if (PreferPredicateOverEpilogue || 8135 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || 8136 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 8137 LVL.getLAI()) && 8138 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) 8139 return CM_ScalarEpilogueNotNeededUsePredicate; 8140 8141 return CM_ScalarEpilogueAllowed; 8142 } 8143 8144 // Process the loop in the VPlan-native vectorization path. This path builds 8145 // VPlan upfront in the vectorization pipeline, which allows to apply 8146 // VPlan-to-VPlan transformations from the very beginning without modifying the 8147 // input LLVM IR. 
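// Only outer loops are handled here; planInVPlanNativePath() returns
// VectorizationFactor::Disabled() for inner loops, in which case this
// function bails out below without generating code.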
8148 static bool processLoopInVPlanNativePath( 8149 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 8150 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 8151 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 8152 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 8153 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 8154 8155 if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) { 8156 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 8157 return false; 8158 } 8159 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 8160 Function *F = L->getHeader()->getParent(); 8161 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 8162 8163 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8164 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 8165 8166 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 8167 &Hints, IAI); 8168 // Use the planner for outer loop vectorization. 8169 // TODO: CM is not used at this point inside the planner. Turn CM into an 8170 // optional argument if we don't need it in the future. 8171 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 8172 8173 // Get user vectorization factor. 8174 const unsigned UserVF = Hints.getWidth(); 8175 8176 // Plan how to best vectorize, return the best VF and its cost. 8177 const VectorizationFactor VF = 8178 LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF)); 8179 8180 // If we are stress testing VPlan builds, do not attempt to generate vector 8181 // code. Masked vector code generation support will follow soon. 8182 // Also, do not attempt to vectorize if no vector code will be produced. 8183 if (VPlanBuildStressTest || EnableVPlanPredication || 8184 VectorizationFactor::Disabled() == VF) 8185 return false; 8186 8187 LVP.setBestPlan(VF.Width, 1); 8188 8189 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 8190 &CM, BFI, PSI); 8191 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 8192 << L->getHeader()->getParent()->getName() << "\"\n"); 8193 LVP.executePlan(LB, DT); 8194 8195 // Mark the loop as already vectorized to avoid vectorizing again. 8196 Hints.setAlreadyVectorized(); 8197 8198 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 8199 return true; 8200 } 8201 8202 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 8203 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 8204 !EnableLoopInterleaving), 8205 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 8206 !EnableLoopVectorization) {} 8207 8208 bool LoopVectorizePass::processLoop(Loop *L) { 8209 assert((EnableVPlanNativePath || L->empty()) && 8210 "VPlan-native path is not enabled. Only process inner loops."); 8211 8212 #ifndef NDEBUG 8213 const std::string DebugLocStr = getDebugLocString(L); 8214 #endif /* NDEBUG */ 8215 8216 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 8217 << L->getHeader()->getParent()->getName() << "\" from " 8218 << DebugLocStr << "\n"); 8219 8220 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 8221 8222 LLVM_DEBUG( 8223 dbgs() << "LV: Loop hints:" 8224 << " force=" 8225 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 8226 ? "disabled" 8227 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 8228 ? 
"enabled" 8229 : "?")) 8230 << " width=" << Hints.getWidth() 8231 << " unroll=" << Hints.getInterleave() << "\n"); 8232 8233 // Function containing loop 8234 Function *F = L->getHeader()->getParent(); 8235 8236 // Looking at the diagnostic output is the only way to determine if a loop 8237 // was vectorized (other than looking at the IR or machine code), so it 8238 // is important to generate an optimization remark for each loop. Most of 8239 // these messages are generated as OptimizationRemarkAnalysis. Remarks 8240 // generated as OptimizationRemark and OptimizationRemarkMissed are 8241 // less verbose reporting vectorized loops and unvectorized loops that may 8242 // benefit from vectorization, respectively. 8243 8244 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 8245 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 8246 return false; 8247 } 8248 8249 PredicatedScalarEvolution PSE(*SE, *L); 8250 8251 // Check if it is legal to vectorize the loop. 8252 LoopVectorizationRequirements Requirements(*ORE); 8253 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 8254 &Requirements, &Hints, DB, AC, BFI, PSI); 8255 if (!LVL.canVectorize(EnableVPlanNativePath)) { 8256 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 8257 Hints.emitRemarkWithHints(); 8258 return false; 8259 } 8260 8261 // Check the function attributes and profiles to find out if this function 8262 // should be optimized for size. 8263 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8264 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 8265 8266 // Entrance to the VPlan-native vectorization path. Outer loops are processed 8267 // here. They may require CFG and instruction level transformations before 8268 // even evaluating whether vectorization is profitable. Since we cannot modify 8269 // the incoming IR, we need to build VPlan upfront in the vectorization 8270 // pipeline. 8271 if (!L->empty()) 8272 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 8273 ORE, BFI, PSI, Hints); 8274 8275 assert(L->empty() && "Inner loop expected."); 8276 8277 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 8278 // count by optimizing for size, to minimize overheads. 8279 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 8280 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 8281 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 8282 << "This loop is worth vectorizing only if no scalar " 8283 << "iteration overheads are incurred."); 8284 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 8285 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 8286 else { 8287 LLVM_DEBUG(dbgs() << "\n"); 8288 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 8289 } 8290 } 8291 8292 // Check the function attributes to see if implicit floats are allowed. 8293 // FIXME: This check doesn't seem possibly correct -- what if the loop is 8294 // an integer loop and the vector instructions selected are purely integer 8295 // vector instructions? 8296 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 8297 reportVectorizationFailure( 8298 "Can't vectorize when the NoImplicitFloat attribute is used", 8299 "loop not vectorized due to NoImplicitFloat attribute", 8300 "NoImplicitFloat", ORE, L); 8301 Hints.emitRemarkWithHints(); 8302 return false; 8303 } 8304 8305 // Check if the target supports potentially unsafe FP vectorization. 

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem to be correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved)
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor and interleave count.
  unsigned UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF =
      LVP.plan(ElementCount::getFixed(UserVF), UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }
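
  // Added commentary (not from the original source): the chain of checks below
  // chooses an interleaving diagnostic from the computed interleave count IC
  // and the user-requested count UserIC, roughly:
  //
  //   no viable VF and UserIC > 1  ->  interleaving avoided up front
  //   IC == 1 and UserIC <= 1      ->  interleaving not beneficial
  //   IC > 1  and UserIC == 1      ->  beneficial but explicitly disabled
  //
  // A UserIC > 0 then overrides the cost-model choice of IC.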
  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being
    // explicitly requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM, BFI, PSI);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
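    // Added commentary (metadata spelling is an assumption, not restated from
    // this section): AddRuntimeUnrollDisableMetaData, defined earlier in this
    // file, is expected to attach "llvm.loop.unroll.runtime.disable" to the
    // remainder loop's loop ID so that the runtime unroller leaves the scalar
    // remainder alone.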
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // All loop nests in the function have been processed at this point.
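  // Added commentary (not from the original source): 'Changed' records any IR
  // modification (including LCSSA formation), while 'CFGChanged' only tracks
  // loop simplification and successful loop processing; the caller uses the
  // latter below to decide whether CFG-level analyses can be preserved.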
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
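
// Added usage note (not part of the original source; option spellings are
// assumptions based on common LLVM usage rather than restated from this
// file): with the new pass manager the vectorizer can be exercised in
// isolation, for example
//
//   opt -passes=loop-vectorize -S input.ll
//
// optionally combined with -force-vector-width=<N> and
// -force-vector-interleave=<N> to override the cost model's choices while
// debugging.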