//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
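
// For illustration only (not part of the pass logic): conceptually, a scalar
// loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is rewritten so that each vector iteration processes VF elements at once and
// the induction variable advances by VF. Assuming VF = 4 and ignoring the
// remainder (which in practice is handled by a scalar epilogue or by tail
// folding), this behaves like
//
//   for (int i = 0; i + 4 <= n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3];   // one wide SIMD operation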

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
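
// For illustration (assuming VF = 4): with a scalar epilogue, a loop of N
// iterations runs floor(N/4) wide iterations followed by a scalar remainder
// loop for the last N % 4 iterations. With tail folding there is no remainder
// loop; every wide iteration runs under a lane mask, conceptually
//
//   mask = {i+0 < N, i+1 < N, i+2 < N, i+3 < N}
//   masked loads/stores are executed with 'mask'
//
// so the final, partial iteration simply disables the out-of-bounds lanes.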

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return TypeSize::get(VF.getKnownMinValue() *
                             DL.getTypeAllocSize(Ty).getFixedValue(),
                         VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
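
// Example of an irregular type, for illustration only: on a typical x86-64
// data layout, x86_fp80 occupies 80 bits but its allocation size includes
// padding, so an array of it is not laid out like the corresponding vector and
// the check above reports it as irregular. In contrast, i32 has equal
// allocation and store sizes, so <VF x i32> is bitcast-compatible with an i32
// array.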

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead of
  /// \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start,
                             TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate
  /// a vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
    VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
  }

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
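
  // For illustration: a first-order recurrence (see fixFirstOrderRecurrence
  // above) is a phi that uses a value produced in the previous iteration,
  // e.g. in
  //
  //   for (int i = 1; i < n; ++i)
  //     b[i] = a[i] + a[i - 1];   // a[i - 1] is the previous iteration's a[i]
  //
  // the vectorized form builds the "previous" vector by combining the current
  // and prior vectors, which is why these phis need a dedicated fix-up phase.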

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);
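
  // A rough sketch of the control flow produced around the vector loop
  // (simplified; the exact set of check blocks depends on the loop):
  //
  //   [iteration count check] --(too few iterations)--> [scalar preheader]
  //             |
  //   [SCEV / memory runtime checks] --(failed)-------> [scalar preheader]
  //             |
  //   [vector preheader] -> [vector loop body] -> [middle block]
  //                                                     |
  //                            (remaining iterations) [scalar preheader]
  //                                                     |
  //                                                [scalar loop]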

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};
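
// For illustration, epilogue vectorization lays the loops out roughly as
//
//   main vector loop (VF = MainLoopVF, UF = MainLoopUF)
//     -> vector epilogue loop (smaller EpilogueVF, UF = 1)
//       -> scalar remainder loop
//
// so that iterations left over by the wide main loop can still be executed in
// vector form before falling back to scalar code.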

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                                 LoopInfo *LI, DominatorTree *DT,
                                 const TargetLibraryInfo *TLI,
                                 const TargetTransformInfo *TTI,
                                 AssumptionCache *AC,
                                 OptimizationRemarkEmitter *ORE,
                                 EpilogueLoopVectorizationInfo &EPI,
                                 LoopVectorizationLegality *LVL,
                                 llvm::LoopVectorizationCostModel *CM,
                                 BlockFrequencyInfo *BFI,
                                 ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE;
       ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the expected
/// speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };
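
  // For illustration (hypothetical numbers): if at VF = 4 a loop keeps three
  // <4 x i32> values live across the same program point and uses one
  // loop-invariant vector value inside the loop, the register usage computed
  // below would report MaxLocalUsers = 3 and LoopInvariantRegs = 1 for the
  // target's vector register class at that VF.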
1292 SmallVector<RegisterUsage, 8> 1293 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1294 1295 /// Collect values we want to ignore in the cost model. 1296 void collectValuesToIgnore(); 1297 1298 /// Split reductions into those that happen in the loop, and those that happen 1299 /// outside. In loop reductions are collected into InLoopReductionChains. 1300 void collectInLoopReductions(); 1301 1302 /// \returns The smallest bitwidth each instruction can be represented with. 1303 /// The vector equivalents of these instructions should be truncated to this 1304 /// type. 1305 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1306 return MinBWs; 1307 } 1308 1309 /// \returns True if it is more profitable to scalarize instruction \p I for 1310 /// vectorization factor \p VF. 1311 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1312 assert(VF.isVector() && 1313 "Profitable to scalarize relevant only for VF > 1."); 1314 1315 // Cost model is not run in the VPlan-native path - return conservative 1316 // result until this changes. 1317 if (EnableVPlanNativePath) 1318 return false; 1319 1320 auto Scalars = InstsToScalarize.find(VF); 1321 assert(Scalars != InstsToScalarize.end() && 1322 "VF not yet analyzed for scalarization profitability"); 1323 return Scalars->second.find(I) != Scalars->second.end(); 1324 } 1325 1326 /// Returns true if \p I is known to be uniform after vectorization. 1327 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1328 if (VF.isScalar()) 1329 return true; 1330 1331 // Cost model is not run in the VPlan-native path - return conservative 1332 // result until this changes. 1333 if (EnableVPlanNativePath) 1334 return false; 1335 1336 auto UniformsPerVF = Uniforms.find(VF); 1337 assert(UniformsPerVF != Uniforms.end() && 1338 "VF not yet analyzed for uniformity"); 1339 return UniformsPerVF->second.count(I); 1340 } 1341 1342 /// Returns true if \p I is known to be scalar after vectorization. 1343 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1344 if (VF.isScalar()) 1345 return true; 1346 1347 // Cost model is not run in the VPlan-native path - return conservative 1348 // result until this changes. 1349 if (EnableVPlanNativePath) 1350 return false; 1351 1352 auto ScalarsPerVF = Scalars.find(VF); 1353 assert(ScalarsPerVF != Scalars.end() && 1354 "Scalar values are not calculated for VF"); 1355 return ScalarsPerVF->second.count(I); 1356 } 1357 1358 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1359 /// for vectorization factor \p VF. 1360 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1361 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1362 !isProfitableToScalarize(I, VF) && 1363 !isScalarAfterVectorization(I, VF); 1364 } 1365 1366 /// Decision that was taken during cost calculation for memory instruction. 1367 enum InstWidening { 1368 CM_Unknown, 1369 CM_Widen, // For consecutive accesses with stride +1. 1370 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1371 CM_Interleave, 1372 CM_GatherScatter, 1373 CM_Scalarize 1374 }; 1375 1376 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1377 /// instruction \p I and vector width \p VF. 
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W, unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return true if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
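  /// For example (illustrative): a udiv that only executes under a condition
  /// may divide by zero if executed speculatively, so it is scalarized and
  /// each scalar copy is emitted under its lane's predicate; the expressions
  /// feeding it may then also become cheaper to scalarize (see
  /// computePredInstDiscount).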
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if \p I will be
  /// scalarized with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
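  /// For example (illustrative): in `for (i = 0; i < n; ++i) sum += A[i];` the
  /// load of A[i] is a consecutive, stride-one access and can be widened into
  /// a single vector load per part, whereas an access like A[B[i]] is not
  /// consecutive and would need a gather or scalarization instead.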
1532 bool 1533 memoryInstructionCanBeWidened(Instruction *I, 1534 ElementCount VF = ElementCount::getFixed(1)); 1535 1536 /// Returns true if \p I is a memory instruction in an interleaved-group 1537 /// of memory accesses that can be vectorized with wide vector loads/stores 1538 /// and shuffles. 1539 bool 1540 interleavedAccessCanBeWidened(Instruction *I, 1541 ElementCount VF = ElementCount::getFixed(1)); 1542 1543 /// Check if \p Instr belongs to any interleaved access group. 1544 bool isAccessInterleaved(Instruction *Instr) { 1545 return InterleaveInfo.isInterleaved(Instr); 1546 } 1547 1548 /// Get the interleaved access group that \p Instr belongs to. 1549 const InterleaveGroup<Instruction> * 1550 getInterleavedAccessGroup(Instruction *Instr) { 1551 return InterleaveInfo.getInterleaveGroup(Instr); 1552 } 1553 1554 /// Returns true if we're required to use a scalar epilogue for at least 1555 /// the final iteration of the original loop. 1556 bool requiresScalarEpilogue() const { 1557 if (!isScalarEpilogueAllowed()) 1558 return false; 1559 // If we might exit from anywhere but the latch, must run the exiting 1560 // iteration in scalar form. 1561 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1562 return true; 1563 return InterleaveInfo.requiresScalarEpilogue(); 1564 } 1565 1566 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1567 /// loop hint annotation. 1568 bool isScalarEpilogueAllowed() const { 1569 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1570 } 1571 1572 /// Returns true if all loop blocks should be masked to fold tail loop. 1573 bool foldTailByMasking() const { return FoldTailByMasking; } 1574 1575 bool blockNeedsPredication(BasicBlock *BB) { 1576 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1577 } 1578 1579 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1580 /// nodes to the chain of instructions representing the reductions. Uses a 1581 /// MapVector to ensure deterministic iteration order. 1582 using ReductionChainMap = 1583 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1584 1585 /// Return the chain of instructions representing an inloop reduction. 1586 const ReductionChainMap &getInLoopReductionChains() const { 1587 return InLoopReductionChains; 1588 } 1589 1590 /// Returns true if the Phi is part of an inloop reduction. 1591 bool isInLoopReduction(PHINode *Phi) const { 1592 return InLoopReductionChains.count(Phi); 1593 } 1594 1595 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1596 /// with factor VF. Return the cost of the instruction, including 1597 /// scalarization overhead if it's needed. 1598 unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1599 1600 /// Estimate cost of a call instruction CI if it were vectorized with factor 1601 /// VF. Return the cost of the instruction, including scalarization overhead 1602 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1603 /// scalarized - 1604 /// i.e. either vector version isn't available, or is too expensive. 1605 unsigned getVectorCallCost(CallInst *CI, ElementCount VF, 1606 bool &NeedToScalarize); 1607 1608 /// Invalidates decisions already taken by the cost model. 
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, a power-of-2 larger
  /// than zero. One is returned if vectorization should best be avoided due
  /// to cost.
  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
                                    ElementCount UserVF);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(ElementCount VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
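  /// For example (illustrative): if an i32 value is computed but only its low
  /// 8 bits are ever used, its minimal bitwidth is 8 and the vectorized code
  /// can operate on <VF x i8> elements instead of <VF x i32>.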
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
1747 void collectLoopUniforms(ElementCount VF); 1748 1749 /// Collect the instructions that are scalar after vectorization. An 1750 /// instruction is scalar if it is known to be uniform or will be scalarized 1751 /// during vectorization. Non-uniform scalarized instructions will be 1752 /// represented by VF values in the vectorized loop, each corresponding to an 1753 /// iteration of the original scalar loop. 1754 void collectLoopScalars(ElementCount VF); 1755 1756 /// Keeps cost model vectorization decision and cost for instructions. 1757 /// Right now it is used for memory instructions only. 1758 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1759 std::pair<InstWidening, unsigned>>; 1760 1761 DecisionList WideningDecisions; 1762 1763 /// Returns true if \p V is expected to be vectorized and it needs to be 1764 /// extracted. 1765 bool needsExtract(Value *V, ElementCount VF) const { 1766 Instruction *I = dyn_cast<Instruction>(V); 1767 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1768 TheLoop->isLoopInvariant(I)) 1769 return false; 1770 1771 // Assume we can vectorize V (and hence we need extraction) if the 1772 // scalars are not computed yet. This can happen, because it is called 1773 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1774 // the scalars are collected. That should be a safe assumption in most 1775 // cases, because we check if the operands have vectorizable types 1776 // beforehand in LoopVectorizationLegality. 1777 return Scalars.find(VF) == Scalars.end() || 1778 !isScalarAfterVectorization(I, VF); 1779 }; 1780 1781 /// Returns a range containing only operands needing to be extracted. 1782 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1783 ElementCount VF) { 1784 return SmallVector<Value *, 4>(make_filter_range( 1785 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1786 } 1787 1788 /// Determines if we have the infrastructure to vectorize loop \p L and its 1789 /// epilogue, assuming the main loop is vectorized by \p VF. 1790 bool isCandidateForEpilogueVectorization(const Loop &L, 1791 const ElementCount VF) const; 1792 1793 /// Returns true if epilogue vectorization is considered profitable, and 1794 /// false otherwise. 1795 /// \p VF is the vectorization factor chosen for the original loop. 1796 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1797 1798 public: 1799 /// The loop that we evaluate. 1800 Loop *TheLoop; 1801 1802 /// Predicated scalar evolution analysis. 1803 PredicatedScalarEvolution &PSE; 1804 1805 /// Loop Info analysis. 1806 LoopInfo *LI; 1807 1808 /// Vectorization legality. 1809 LoopVectorizationLegality *Legal; 1810 1811 /// Vector target information. 1812 const TargetTransformInfo &TTI; 1813 1814 /// Target Library Info. 1815 const TargetLibraryInfo *TLI; 1816 1817 /// Demanded bits analysis. 1818 DemandedBits *DB; 1819 1820 /// Assumption cache. 1821 AssumptionCache *AC; 1822 1823 /// Interface to emit optimization remarks. 1824 OptimizationRemarkEmitter *ORE; 1825 1826 const Function *TheFunction; 1827 1828 /// Loop Vectorize Hint. 1829 const LoopVectorizeHints *Hints; 1830 1831 /// The interleave access information contains groups of interleaved accesses 1832 /// with the same stride and close to each other. 1833 InterleavedAccessInfo &InterleaveInfo; 1834 1835 /// Values to ignore in the cost model. 1836 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1837 1838 /// Values to ignore in the cost model when VF > 1. 
1839 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1840 1841 /// Profitable vector factors. 1842 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1843 }; 1844 1845 } // end namespace llvm 1846 1847 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1848 // vectorization. The loop needs to be annotated with #pragma omp simd 1849 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1850 // vector length information is not provided, vectorization is not considered 1851 // explicit. Interleave hints are not allowed either. These limitations will be 1852 // relaxed in the future. 1853 // Please, note that we are currently forced to abuse the pragma 'clang 1854 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1855 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1856 // provides *explicit vectorization hints* (LV can bypass legal checks and 1857 // assume that vectorization is legal). However, both hints are implemented 1858 // using the same metadata (llvm.loop.vectorize, processed by 1859 // LoopVectorizeHints). This will be fixed in the future when the native IR 1860 // representation for pragma 'omp simd' is introduced. 1861 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1862 OptimizationRemarkEmitter *ORE) { 1863 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 1864 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1865 1866 // Only outer loops with an explicit vectorization hint are supported. 1867 // Unannotated outer loops are ignored. 1868 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1869 return false; 1870 1871 Function *Fn = OuterLp->getHeader()->getParent(); 1872 if (!Hints.allowVectorization(Fn, OuterLp, 1873 true /*VectorizeOnlyWhenForced*/)) { 1874 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1875 return false; 1876 } 1877 1878 if (Hints.getInterleave() > 1) { 1879 // TODO: Interleave support is future work. 1880 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1881 "outer loops.\n"); 1882 Hints.emitRemarkWithHints(); 1883 return false; 1884 } 1885 1886 return true; 1887 } 1888 1889 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1890 OptimizationRemarkEmitter *ORE, 1891 SmallVectorImpl<Loop *> &V) { 1892 // Collect inner loops and outer loops without irreducible control flow. For 1893 // now, only collect outer loops that have explicit vectorization hints. If we 1894 // are stress testing the VPlan H-CFG construction, we collect the outermost 1895 // loop of every loop nest. 1896 if (L.isInnermost() || VPlanBuildStressTest || 1897 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1898 LoopBlocksRPO RPOT(&L); 1899 RPOT.perform(LI); 1900 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1901 V.push_back(&L); 1902 // TODO: Collect inner loops inside marked outer loops in case 1903 // vectorization fails for the outer loop. Do not invoke 1904 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1905 // already known to be reducible. We can use an inherited attribute for 1906 // that. 1907 return; 1908 } 1909 } 1910 for (Loop *InnerL : L) 1911 collectSupportedLoops(*InnerL, LI, ORE, V); 1912 } 1913 1914 namespace { 1915 1916 /// The LoopVectorize Pass. 
1917 struct LoopVectorize : public FunctionPass { 1918 /// Pass identification, replacement for typeid 1919 static char ID; 1920 1921 LoopVectorizePass Impl; 1922 1923 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1924 bool VectorizeOnlyWhenForced = false) 1925 : FunctionPass(ID), 1926 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1927 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1928 } 1929 1930 bool runOnFunction(Function &F) override { 1931 if (skipFunction(F)) 1932 return false; 1933 1934 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1935 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1936 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1937 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1938 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1939 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1940 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1941 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1942 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1943 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1944 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1945 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1946 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1947 1948 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1949 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1950 1951 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1952 GetLAA, *ORE, PSI).MadeAnyChange; 1953 } 1954 1955 void getAnalysisUsage(AnalysisUsage &AU) const override { 1956 AU.addRequired<AssumptionCacheTracker>(); 1957 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1958 AU.addRequired<DominatorTreeWrapperPass>(); 1959 AU.addRequired<LoopInfoWrapperPass>(); 1960 AU.addRequired<ScalarEvolutionWrapperPass>(); 1961 AU.addRequired<TargetTransformInfoWrapperPass>(); 1962 AU.addRequired<AAResultsWrapperPass>(); 1963 AU.addRequired<LoopAccessLegacyAnalysis>(); 1964 AU.addRequired<DemandedBitsWrapperPass>(); 1965 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1966 AU.addRequired<InjectTLIMappingsLegacy>(); 1967 1968 // We currently do not preserve loopinfo/dominator analyses with outer loop 1969 // vectorization. Until this is addressed, mark these analyses as preserved 1970 // only for non-VPlan-native path. 1971 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1972 if (!EnableVPlanNativePath) { 1973 AU.addPreserved<LoopInfoWrapperPass>(); 1974 AU.addPreserved<DominatorTreeWrapperPass>(); 1975 } 1976 1977 AU.addPreserved<BasicAAWrapperPass>(); 1978 AU.addPreserved<GlobalsAAWrapperPass>(); 1979 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1980 } 1981 }; 1982 1983 } // end anonymous namespace 1984 1985 //===----------------------------------------------------------------------===// 1986 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1987 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1988 //===----------------------------------------------------------------------===// 1989 1990 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1991 // We need to place the broadcast of invariant variables outside the loop, 1992 // but only if it's proven safe to do so. Else, broadcast will be inside 1993 // vector loop body. 
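  // For illustration: with VF = 4, the splat created below for a non-constant
  // scalar %v is built by inserting %v into lane 0 of a fresh vector and
  // splatting it with a zero shuffle mask, yielding <%v, %v, %v, %v>.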
1994 Instruction *Instr = dyn_cast<Instruction>(V); 1995 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1996 (!Instr || 1997 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1998 // Place the code for broadcasting invariant variables in the new preheader. 1999 IRBuilder<>::InsertPointGuard Guard(Builder); 2000 if (SafeToHoist) 2001 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2002 2003 // Broadcast the scalar into all locations in the vector. 2004 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2005 2006 return Shuf; 2007 } 2008 2009 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2010 const InductionDescriptor &II, Value *Step, Value *Start, 2011 Instruction *EntryVal) { 2012 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2013 "Expected either an induction phi-node or a truncate of it!"); 2014 2015 // Construct the initial value of the vector IV in the vector loop preheader 2016 auto CurrIP = Builder.saveIP(); 2017 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2018 if (isa<TruncInst>(EntryVal)) { 2019 assert(Start->getType()->isIntegerTy() && 2020 "Truncation requires an integer type"); 2021 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2022 Step = Builder.CreateTrunc(Step, TruncType); 2023 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2024 } 2025 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2026 Value *SteppedStart = 2027 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2028 2029 // We create vector phi nodes for both integer and floating-point induction 2030 // variables. Here, we determine the kind of arithmetic we will perform. 2031 Instruction::BinaryOps AddOp; 2032 Instruction::BinaryOps MulOp; 2033 if (Step->getType()->isIntegerTy()) { 2034 AddOp = Instruction::Add; 2035 MulOp = Instruction::Mul; 2036 } else { 2037 AddOp = II.getInductionOpcode(); 2038 MulOp = Instruction::FMul; 2039 } 2040 2041 // Multiply the vectorization factor by the step using integer or 2042 // floating-point arithmetic as appropriate. 2043 Value *ConstVF = 2044 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2045 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 2046 2047 // Create a vector splat to use in the induction update. 2048 // 2049 // FIXME: If the step is non-constant, we create the vector splat with 2050 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2051 // handle a constant vector splat. 2052 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2053 Value *SplatVF = isa<Constant>(Mul) 2054 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2055 : Builder.CreateVectorSplat(VF, Mul); 2056 Builder.restoreIP(CurrIP); 2057 2058 // We may need to add the step a number of times, depending on the unroll 2059 // factor. The last of those goes into the PHI. 
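  // For example (illustrative): with UF = 2, VF = 4 and an integer step of 1,
  // part 0 uses the phi %vec.ind = <i, i+1, i+2, i+3> and part 1 uses
  // %step.add = %vec.ind + <4, 4, 4, 4>; the final %step.add (advanced by
  // UF * VF = 8) is moved to the latch and becomes the phi's backedge value.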
2060 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2061 &*LoopVectorBody->getFirstInsertionPt()); 2062 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2063 Instruction *LastInduction = VecInd; 2064 for (unsigned Part = 0; Part < UF; ++Part) { 2065 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 2066 2067 if (isa<TruncInst>(EntryVal)) 2068 addMetadata(LastInduction, EntryVal); 2069 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 2070 2071 LastInduction = cast<Instruction>(addFastMathFlag( 2072 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 2073 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2074 } 2075 2076 // Move the last step to the end of the latch block. This ensures consistent 2077 // placement of all induction updates. 2078 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2079 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2080 auto *ICmp = cast<Instruction>(Br->getCondition()); 2081 LastInduction->moveBefore(ICmp); 2082 LastInduction->setName("vec.ind.next"); 2083 2084 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2085 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2086 } 2087 2088 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2089 return Cost->isScalarAfterVectorization(I, VF) || 2090 Cost->isProfitableToScalarize(I, VF); 2091 } 2092 2093 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2094 if (shouldScalarizeInstruction(IV)) 2095 return true; 2096 auto isScalarInst = [&](User *U) -> bool { 2097 auto *I = cast<Instruction>(U); 2098 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2099 }; 2100 return llvm::any_of(IV->users(), isScalarInst); 2101 } 2102 2103 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2104 const InductionDescriptor &ID, const Instruction *EntryVal, 2105 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 2106 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2107 "Expected either an induction phi-node or a truncate of it!"); 2108 2109 // This induction variable is not the phi from the original loop but the 2110 // newly-created IV based on the proof that casted Phi is equal to the 2111 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2112 // re-uses the same InductionDescriptor that original IV uses but we don't 2113 // have to do any recording in this case - that is done when original IV is 2114 // processed. 2115 if (isa<TruncInst>(EntryVal)) 2116 return; 2117 2118 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2119 if (Casts.empty()) 2120 return; 2121 // Only the first Cast instruction in the Casts vector is of interest. 2122 // The rest of the Casts (if exist) have no uses outside the 2123 // induction update chain itself. 
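  // For example (illustrative): if the original loop contains a cast of the
  // IV such as `%t = trunc i64 %iv to i32` that predicated SCEV has proven
  // equal to the IV, %t is recorded as an induction cast here and is simply
  // mapped to the same vector (or scalar) values created for the IV.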
2124 Instruction *CastInst = *Casts.begin(); 2125 if (Lane < UINT_MAX) 2126 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 2127 else 2128 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 2129 } 2130 2131 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2132 TruncInst *Trunc) { 2133 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2134 "Primary induction variable must have an integer type"); 2135 2136 auto II = Legal->getInductionVars().find(IV); 2137 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2138 2139 auto ID = II->second; 2140 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2141 2142 // The value from the original loop to which we are mapping the new induction 2143 // variable. 2144 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2145 2146 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2147 2148 // Generate code for the induction step. Note that induction steps are 2149 // required to be loop-invariant 2150 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2151 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2152 "Induction step should be loop invariant"); 2153 if (PSE.getSE()->isSCEVable(IV->getType())) { 2154 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2155 return Exp.expandCodeFor(Step, Step->getType(), 2156 LoopVectorPreHeader->getTerminator()); 2157 } 2158 return cast<SCEVUnknown>(Step)->getValue(); 2159 }; 2160 2161 // The scalar value to broadcast. This is derived from the canonical 2162 // induction variable. If a truncation type is given, truncate the canonical 2163 // induction variable and step. Otherwise, derive these values from the 2164 // induction descriptor. 2165 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2166 Value *ScalarIV = Induction; 2167 if (IV != OldInduction) { 2168 ScalarIV = IV->getType()->isIntegerTy() 2169 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2170 : Builder.CreateCast(Instruction::SIToFP, Induction, 2171 IV->getType()); 2172 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2173 ScalarIV->setName("offset.idx"); 2174 } 2175 if (Trunc) { 2176 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2177 assert(Step->getType()->isIntegerTy() && 2178 "Truncation requires an integer step"); 2179 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2180 Step = Builder.CreateTrunc(Step, TruncType); 2181 } 2182 return ScalarIV; 2183 }; 2184 2185 // Create the vector values from the scalar IV, in the absence of creating a 2186 // vector IV. 2187 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2188 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2189 for (unsigned Part = 0; Part < UF; ++Part) { 2190 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2191 Value *EntryPart = 2192 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2193 ID.getInductionOpcode()); 2194 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 2195 if (Trunc) 2196 addMetadata(EntryPart, Trunc); 2197 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 2198 } 2199 }; 2200 2201 // Now do the actual transformations, and start with creating the step value. 
2202 Value *Step = CreateStepValue(ID.getStep()); 2203 if (VF.isZero() || VF.isScalar()) { 2204 Value *ScalarIV = CreateScalarIV(Step); 2205 CreateSplatIV(ScalarIV, Step); 2206 return; 2207 } 2208 2209 // Determine if we want a scalar version of the induction variable. This is 2210 // true if the induction variable itself is not widened, or if it has at 2211 // least one user in the loop that is not widened. 2212 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2213 if (!NeedsScalarIV) { 2214 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); 2215 return; 2216 } 2217 2218 // Try to create a new independent vector induction variable. If we can't 2219 // create the phi node, we will splat the scalar induction variable in each 2220 // loop iteration. 2221 if (!shouldScalarizeInstruction(EntryVal)) { 2222 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); 2223 Value *ScalarIV = CreateScalarIV(Step); 2224 // Create scalar steps that can be used by instructions we will later 2225 // scalarize. Note that the addition of the scalar steps will not increase 2226 // the number of instructions in the loop in the common case prior to 2227 // InstCombine. We will be trading one vector extract for each scalar step. 2228 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2229 return; 2230 } 2231 2232 // All IV users are scalar instructions, so only emit a scalar IV, not a 2233 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2234 // predicate used by the masked loads/stores. 2235 Value *ScalarIV = CreateScalarIV(Step); 2236 if (!Cost->isScalarEpilogueAllowed()) 2237 CreateSplatIV(ScalarIV, Step); 2238 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2239 } 2240 2241 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2242 Instruction::BinaryOps BinOp) { 2243 // Create and check the types. 2244 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2245 int VLen = ValVTy->getNumElements(); 2246 2247 Type *STy = Val->getType()->getScalarType(); 2248 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2249 "Induction Step must be an integer or FP"); 2250 assert(Step->getType() == STy && "Step has wrong type"); 2251 2252 SmallVector<Constant *, 8> Indices; 2253 2254 if (STy->isIntegerTy()) { 2255 // Create a vector of consecutive numbers from zero to VF. 2256 for (int i = 0; i < VLen; ++i) 2257 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2258 2259 // Add the consecutive indices to the vector value. 2260 Constant *Cv = ConstantVector::get(Indices); 2261 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2262 Step = Builder.CreateVectorSplat(VLen, Step); 2263 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2264 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2265 // which can be found from the original scalar operations. 2266 Step = Builder.CreateMul(Cv, Step); 2267 return Builder.CreateAdd(Val, Step, "induction"); 2268 } 2269 2270 // Floating point induction. 2271 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2272 "Binary Opcode should be specified for FP induction"); 2273 // Create a vector of consecutive numbers from zero to VF. 2274 for (int i = 0; i < VLen; ++i) 2275 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2276 2277 // Add the consecutive indices to the vector value. 
2278 Constant *Cv = ConstantVector::get(Indices); 2279 2280 Step = Builder.CreateVectorSplat(VLen, Step); 2281 2282 // Floating point operations had to be 'fast' to enable the induction. 2283 FastMathFlags Flags; 2284 Flags.setFast(); 2285 2286 Value *MulOp = Builder.CreateFMul(Cv, Step); 2287 if (isa<Instruction>(MulOp)) 2288 // Have to check, MulOp may be a constant 2289 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2290 2291 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2292 if (isa<Instruction>(BOp)) 2293 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2294 return BOp; 2295 } 2296 2297 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2298 Instruction *EntryVal, 2299 const InductionDescriptor &ID) { 2300 // We shouldn't have to build scalar steps if we aren't vectorizing. 2301 assert(VF.isVector() && "VF should be greater than one"); 2302 // Get the value type and ensure it and the step have the same integer type. 2303 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2304 assert(ScalarIVTy == Step->getType() && 2305 "Val and Step should have the same type"); 2306 2307 // We build scalar steps for both integer and floating-point induction 2308 // variables. Here, we determine the kind of arithmetic we will perform. 2309 Instruction::BinaryOps AddOp; 2310 Instruction::BinaryOps MulOp; 2311 if (ScalarIVTy->isIntegerTy()) { 2312 AddOp = Instruction::Add; 2313 MulOp = Instruction::Mul; 2314 } else { 2315 AddOp = ID.getInductionOpcode(); 2316 MulOp = Instruction::FMul; 2317 } 2318 2319 // Determine the number of scalars we need to generate for each unroll 2320 // iteration. If EntryVal is uniform, we only need to generate the first 2321 // lane. Otherwise, we generate all VF values. 2322 unsigned Lanes = 2323 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2324 ? 1 2325 : VF.getKnownMinValue(); 2326 assert((!VF.isScalable() || Lanes == 1) && 2327 "Should never scalarize a scalable vector"); 2328 // Compute the scalar steps and save the results in VectorLoopValueMap. 2329 for (unsigned Part = 0; Part < UF; ++Part) { 2330 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2331 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2332 ScalarIVTy->getScalarSizeInBits()); 2333 Value *StartIdx = 2334 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2335 if (ScalarIVTy->isFloatingPointTy()) 2336 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2337 StartIdx = addFastMathFlag(Builder.CreateBinOp( 2338 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); 2339 // The step returned by `createStepForVF` is a runtime-evaluated value 2340 // when VF is scalable. Otherwise, it should be folded into a Constant. 
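      // For example (illustrative): with a fixed VF = 4, part 1 starts at the
      // constant 4 (with the lane number then added on top), whereas for a
      // scalable VF the start index is computed at runtime as vscale * 4.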
2341 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2342 "Expected StartIdx to be folded to a constant when VF is not " 2343 "scalable"); 2344 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2345 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2346 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2347 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2348 } 2349 } 2350 } 2351 2352 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2353 assert(V != Induction && "The new induction variable should not be used."); 2354 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2355 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2356 2357 // If we have a stride that is replaced by one, do it here. Defer this for 2358 // the VPlan-native path until we start running Legal checks in that path. 2359 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2360 V = ConstantInt::get(V->getType(), 1); 2361 2362 // If we have a vector mapped to this value, return it. 2363 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2364 return VectorLoopValueMap.getVectorValue(V, Part); 2365 2366 // If the value has not been vectorized, check if it has been scalarized 2367 // instead. If it has been scalarized, and we actually need the value in 2368 // vector form, we will construct the vector values on demand. 2369 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2370 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2371 2372 // If we've scalarized a value, that value should be an instruction. 2373 auto *I = cast<Instruction>(V); 2374 2375 // If we aren't vectorizing, we can just copy the scalar map values over to 2376 // the vector map. 2377 if (VF.isScalar()) { 2378 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2379 return ScalarValue; 2380 } 2381 2382 // Get the last scalar instruction we generated for V and Part. If the value 2383 // is known to be uniform after vectorization, this corresponds to lane zero 2384 // of the Part unroll iteration. Otherwise, the last instruction is the one 2385 // we created for the last vector lane of the Part unroll iteration. 2386 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2387 ? 0 2388 : VF.getKnownMinValue() - 1; 2389 assert((!VF.isScalable() || LastLane == 0) && 2390 "Scalable vectorization can't lead to any scalarized values."); 2391 auto *LastInst = cast<Instruction>( 2392 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2393 2394 // Set the insert point after the last scalarized instruction. This ensures 2395 // the insertelement sequence will directly follow the scalar definitions. 2396 auto OldIP = Builder.saveIP(); 2397 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2398 Builder.SetInsertPoint(&*NewIP); 2399 2400 // However, if we are vectorizing, we need to construct the vector values. 2401 // If the value is known to be uniform after vectorization, we can just 2402 // broadcast the scalar value corresponding to lane zero for each unroll 2403 // iteration. Otherwise, we construct the vector values using insertelement 2404 // instructions. Since the resulting vectors are stored in 2405 // VectorLoopValueMap, we will only generate the insertelements once. 
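    // For illustration: a non-uniform scalarized value with VF = 4 is packed
    // back into a vector by four insertelement instructions, one per lane,
    // emitted immediately after its last scalar definition.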
2406 Value *VectorValue = nullptr; 2407 if (Cost->isUniformAfterVectorization(I, VF)) { 2408 VectorValue = getBroadcastInstrs(ScalarValue); 2409 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2410 } else { 2411 // Initialize packing with insertelements to start from poison. 2412 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2413 Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF)); 2414 VectorLoopValueMap.setVectorValue(V, Part, Poison); 2415 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2416 packScalarIntoVectorValue(V, {Part, Lane}); 2417 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2418 } 2419 Builder.restoreIP(OldIP); 2420 return VectorValue; 2421 } 2422 2423 // If this scalar is unknown, assume that it is a constant or that it is 2424 // loop invariant. Broadcast V and save the value for future uses. 2425 Value *B = getBroadcastInstrs(V); 2426 VectorLoopValueMap.setVectorValue(V, Part, B); 2427 return B; 2428 } 2429 2430 Value * 2431 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2432 const VPIteration &Instance) { 2433 // If the value is not an instruction contained in the loop, it should 2434 // already be scalar. 2435 if (OrigLoop->isLoopInvariant(V)) 2436 return V; 2437 2438 assert(Instance.Lane > 0 2439 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2440 : true && "Uniform values only have lane zero"); 2441 2442 // If the value from the original loop has not been vectorized, it is 2443 // represented by UF x VF scalar values in the new loop. Return the requested 2444 // scalar value. 2445 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2446 return VectorLoopValueMap.getScalarValue(V, Instance); 2447 2448 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2449 // for the given unroll part. If this entry is not a vector type (i.e., the 2450 // vectorization factor is one), there is no need to generate an 2451 // extractelement instruction. 2452 auto *U = getOrCreateVectorValue(V, Instance.Part); 2453 if (!U->getType()->isVectorTy()) { 2454 assert(VF.isScalar() && "Value not scalarized has non-vector type"); 2455 return U; 2456 } 2457 2458 // Otherwise, the value from the original loop has been vectorized and is 2459 // represented by UF vector values. Extract and return the requested scalar 2460 // value from the appropriate vector lane. 
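  // For illustration: requesting {Part, Lane} = {1, 2} of a value vectorized
  // with VF = 4 extracts element 2 of the part-1 vector.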
2461 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2462 } 2463 2464 void InnerLoopVectorizer::packScalarIntoVectorValue( 2465 Value *V, const VPIteration &Instance) { 2466 assert(V != Induction && "The new induction variable should not be used."); 2467 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2468 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2469 2470 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2471 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2472 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2473 Builder.getInt32(Instance.Lane)); 2474 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2475 } 2476 2477 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2478 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2479 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2480 SmallVector<int, 8> ShuffleMask; 2481 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2482 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2483 2484 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); 2485 } 2486 2487 // Return whether we allow using masked interleave-groups (for dealing with 2488 // strided loads/stores that reside in predicated blocks, or for dealing 2489 // with gaps). 2490 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2491 // If an override option has been passed in for interleaved accesses, use it. 2492 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2493 return EnableMaskedInterleavedMemAccesses; 2494 2495 return TTI.enableMaskedInterleavedAccessVectorization(); 2496 } 2497 2498 // Try to vectorize the interleave group that \p Instr belongs to. 2499 // 2500 // E.g. Translate following interleaved load group (factor = 3): 2501 // for (i = 0; i < N; i+=3) { 2502 // R = Pic[i]; // Member of index 0 2503 // G = Pic[i+1]; // Member of index 1 2504 // B = Pic[i+2]; // Member of index 2 2505 // ... // do something to R, G, B 2506 // } 2507 // To: 2508 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2509 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2510 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2511 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2512 // 2513 // Or translate following interleaved store group (factor = 3): 2514 // for (i = 0; i < N; i+=3) { 2515 // ... do something to R, G, B 2516 // Pic[i] = R; // Member of index 0 2517 // Pic[i+1] = G; // Member of index 1 2518 // Pic[i+2] = B; // Member of index 2 2519 // } 2520 // To: 2521 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2522 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2523 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2524 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2525 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2526 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2527 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2528 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2529 VPValue *BlockInMask) { 2530 Instruction *Instr = Group->getInsertPos(); 2531 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2532 2533 // Prepare for the vector type of the interleaved load/store. 
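  // For example (illustrative): with VF = 4, an interleave factor of 3 and
  // i32 members, VecTy below is <12 x i32>, i.e. four R,G,B tuples per part.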
2534 Type *ScalarTy = getMemInstValueType(Instr); 2535 unsigned InterleaveFactor = Group->getFactor(); 2536 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2537 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2538 2539 // Prepare for the new pointers. 2540 SmallVector<Value *, 2> AddrParts; 2541 unsigned Index = Group->getIndex(Instr); 2542 2543 // TODO: extend the masked interleaved-group support to reversed access. 2544 assert((!BlockInMask || !Group->isReverse()) && 2545 "Reversed masked interleave-group not supported."); 2546 2547 // If the group is reverse, adjust the index to refer to the last vector lane 2548 // instead of the first. We adjust the index from the first vector lane, 2549 // rather than directly getting the pointer for lane VF - 1, because the 2550 // pointer operand of the interleaved access is supposed to be uniform. For 2551 // uniform instructions, we're only required to generate a value for the 2552 // first vector lane in each unroll iteration. 2553 assert(!VF.isScalable() && 2554 "scalable vector reverse operation is not implemented"); 2555 if (Group->isReverse()) 2556 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2557 2558 for (unsigned Part = 0; Part < UF; Part++) { 2559 Value *AddrPart = State.get(Addr, {Part, 0}); 2560 setDebugLocFromInst(Builder, AddrPart); 2561 2562 // Notice current instruction could be any index. Need to adjust the address 2563 // to the member of index 0. 2564 // 2565 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2566 // b = A[i]; // Member of index 0 2567 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2568 // 2569 // E.g. A[i+1] = a; // Member of index 1 2570 // A[i] = b; // Member of index 0 2571 // A[i+2] = c; // Member of index 2 (Current instruction) 2572 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2573 2574 bool InBounds = false; 2575 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2576 InBounds = gep->isInBounds(); 2577 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2578 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2579 2580 // Cast to the vector pointer type. 2581 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2582 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2583 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2584 } 2585 2586 setDebugLocFromInst(Builder, Instr); 2587 Value *PoisonVec = PoisonValue::get(VecTy); 2588 2589 Value *MaskForGaps = nullptr; 2590 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2591 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2592 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2593 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2594 } 2595 2596 // Vectorize the interleaved load group. 2597 if (isa<LoadInst>(Instr)) { 2598 // For each unroll part, create a wide load for the group. 
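    // For illustration: if the block mask for VF = 4 is <m0, m1, m2, m3>, the
    // replicated mask used below for factor 3 is
    // <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>, so all members of a
    // tuple share their lane's predicate.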
2599 SmallVector<Value *, 2> NewLoads; 2600 for (unsigned Part = 0; Part < UF; Part++) { 2601 Instruction *NewLoad; 2602 if (BlockInMask || MaskForGaps) { 2603 assert(useMaskedInterleavedAccesses(*TTI) && 2604 "masked interleaved groups are not allowed."); 2605 Value *GroupMask = MaskForGaps; 2606 if (BlockInMask) { 2607 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2608 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2609 Value *ShuffledMask = Builder.CreateShuffleVector( 2610 BlockInMaskPart, 2611 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2612 "interleaved.mask"); 2613 GroupMask = MaskForGaps 2614 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2615 MaskForGaps) 2616 : ShuffledMask; 2617 } 2618 NewLoad = 2619 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2620 GroupMask, PoisonVec, "wide.masked.vec"); 2621 } 2622 else 2623 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2624 Group->getAlign(), "wide.vec"); 2625 Group->addMetadata(NewLoad); 2626 NewLoads.push_back(NewLoad); 2627 } 2628 2629 // For each member in the group, shuffle out the appropriate data from the 2630 // wide loads. 2631 unsigned J = 0; 2632 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2633 Instruction *Member = Group->getMember(I); 2634 2635 // Skip the gaps in the group. 2636 if (!Member) 2637 continue; 2638 2639 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2640 auto StrideMask = 2641 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2642 for (unsigned Part = 0; Part < UF; Part++) { 2643 Value *StridedVec = Builder.CreateShuffleVector( 2644 NewLoads[Part], StrideMask, "strided.vec"); 2645 2646 // If this member has different type, cast the result type. 2647 if (Member->getType() != ScalarTy) { 2648 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2649 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2650 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2651 } 2652 2653 if (Group->isReverse()) 2654 StridedVec = reverseVector(StridedVec); 2655 2656 State.set(VPDefs[J], Member, StridedVec, Part); 2657 } 2658 ++J; 2659 } 2660 return; 2661 } 2662 2663 // The sub vector type for current instruction. 2664 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2665 auto *SubVT = VectorType::get(ScalarTy, VF); 2666 2667 // Vectorize the interleaved store group. 2668 for (unsigned Part = 0; Part < UF; Part++) { 2669 // Collect the stored vector from each member. 2670 SmallVector<Value *, 4> StoredVecs; 2671 for (unsigned i = 0; i < InterleaveFactor; i++) { 2672 // Interleaved store group doesn't allow a gap, so each index has a member 2673 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2674 2675 Value *StoredVec = State.get(StoredValues[i], Part); 2676 2677 if (Group->isReverse()) 2678 StoredVec = reverseVector(StoredVec); 2679 2680 // If this member has different type, cast it to a unified type. 2681 2682 if (StoredVec->getType() != SubVT) 2683 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2684 2685 StoredVecs.push_back(StoredVec); 2686 } 2687 2688 // Concatenate all vectors into a wide vector. 2689 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2690 2691 // Interleave the elements in the wide vector. 
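// As a sketch (the mask itself comes from createInterleaveMask): with VF = 4
// and factor 3, the concatenated <12 x i32> is permuted with
//   <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
// so that memory receives the tuples R0,G0,B0, R1,G1,B1, and so on, matching
// the store example in the comment above vectorizeInterleaveGroup.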
2692 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2693 Value *IVec = Builder.CreateShuffleVector( 2694 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2695 "interleaved.vec"); 2696 2697 Instruction *NewStoreInstr; 2698 if (BlockInMask) { 2699 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2700 Value *ShuffledMask = Builder.CreateShuffleVector( 2701 BlockInMaskPart, 2702 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2703 "interleaved.mask"); 2704 NewStoreInstr = Builder.CreateMaskedStore( 2705 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2706 } 2707 else 2708 NewStoreInstr = 2709 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2710 2711 Group->addMetadata(NewStoreInstr); 2712 } 2713 } 2714 2715 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2716 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2717 VPValue *StoredValue, VPValue *BlockInMask) { 2718 // Attempt to issue a wide load. 2719 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2720 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2721 2722 assert((LI || SI) && "Invalid Load/Store instruction"); 2723 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2724 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2725 2726 LoopVectorizationCostModel::InstWidening Decision = 2727 Cost->getWideningDecision(Instr, VF); 2728 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2729 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2730 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2731 "CM decision is not to widen the memory instruction"); 2732 2733 Type *ScalarDataTy = getMemInstValueType(Instr); 2734 2735 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2736 const Align Alignment = getLoadStoreAlignment(Instr); 2737 2738 // Determine if the pointer operand of the access is either consecutive or 2739 // reverse consecutive. 2740 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2741 bool ConsecutiveStride = 2742 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2743 bool CreateGatherScatter = 2744 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2745 2746 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2747 // gather/scatter. Otherwise Decision should have been to Scalarize. 2748 assert((ConsecutiveStride || CreateGatherScatter) && 2749 "The instruction should be scalarized"); 2750 (void)ConsecutiveStride; 2751 2752 VectorParts BlockInMaskParts(UF); 2753 bool isMaskRequired = BlockInMask; 2754 if (isMaskRequired) 2755 for (unsigned Part = 0; Part < UF; ++Part) 2756 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2757 2758 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2759 // Calculate the pointer for the specific unroll-part. 2760 GetElementPtrInst *PartPtr = nullptr; 2761 2762 bool InBounds = false; 2763 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2764 InBounds = gep->isInBounds(); 2765 2766 if (Reverse) { 2767 assert(!VF.isScalable() && 2768 "Reversing vectors is not yet supported for scalable vectors."); 2769 2770 // If the address is consecutive but reversed, then the 2771 // wide store needs to start at the last vector element. 
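// For example (illustrative, assuming VF = 4): part 0 is addressed at
// Ptr + (1 - 4) = Ptr - 3 and covers the last four elements in reverse,
// part 1 at Ptr - 4 + (1 - 4) = Ptr - 7, and so on; the mask (if any) is
// reversed below to keep mask lanes and data elements consistent.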
2772 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2773 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2774 PartPtr->setIsInBounds(InBounds); 2775 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2776 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2777 PartPtr->setIsInBounds(InBounds); 2778 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2779 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2780 } else { 2781 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2782 PartPtr = cast<GetElementPtrInst>( 2783 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2784 PartPtr->setIsInBounds(InBounds); 2785 } 2786 2787 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2788 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2789 }; 2790 2791 // Handle Stores: 2792 if (SI) { 2793 setDebugLocFromInst(Builder, SI); 2794 2795 for (unsigned Part = 0; Part < UF; ++Part) { 2796 Instruction *NewSI = nullptr; 2797 Value *StoredVal = State.get(StoredValue, Part); 2798 if (CreateGatherScatter) { 2799 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2800 Value *VectorGep = State.get(Addr, Part); 2801 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2802 MaskPart); 2803 } else { 2804 if (Reverse) { 2805 // If we store to reverse consecutive memory locations, then we need 2806 // to reverse the order of elements in the stored value. 2807 StoredVal = reverseVector(StoredVal); 2808 // We don't want to update the value in the map as it might be used in 2809 // another expression. So don't call resetVectorValue(StoredVal). 2810 } 2811 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2812 if (isMaskRequired) 2813 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2814 BlockInMaskParts[Part]); 2815 else 2816 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2817 } 2818 addMetadata(NewSI, SI); 2819 } 2820 return; 2821 } 2822 2823 // Handle loads. 2824 assert(LI && "Must have a load instruction"); 2825 setDebugLocFromInst(Builder, LI); 2826 for (unsigned Part = 0; Part < UF; ++Part) { 2827 Value *NewLI; 2828 if (CreateGatherScatter) { 2829 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2830 Value *VectorGep = State.get(Addr, Part); 2831 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2832 nullptr, "wide.masked.gather"); 2833 addMetadata(NewLI, LI); 2834 } else { 2835 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2836 if (isMaskRequired) 2837 NewLI = Builder.CreateMaskedLoad( 2838 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2839 "wide.masked.load"); 2840 else 2841 NewLI = 2842 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2843 2844 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2845 addMetadata(NewLI, LI); 2846 if (Reverse) 2847 NewLI = reverseVector(NewLI); 2848 } 2849 2850 State.set(Def, Instr, NewLI, Part); 2851 } 2852 } 2853 2854 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2855 const VPIteration &Instance, 2856 bool IfPredicateInstr, 2857 VPTransformState &State) { 2858 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2859 2860 setDebugLocFromInst(Builder, Instr); 2861 2862 // Does this instruction return a value ? 
2863 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2864 2865 Instruction *Cloned = Instr->clone(); 2866 if (!IsVoidRetTy) 2867 Cloned->setName(Instr->getName() + ".cloned"); 2868 2869 // Replace the operands of the cloned instructions with their scalar 2870 // equivalents in the new loop. 2871 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2872 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 2873 auto InputInstance = Instance; 2874 if (!Operand || !OrigLoop->contains(Operand) || 2875 (Cost->isUniformAfterVectorization(Operand, State.VF))) 2876 InputInstance.Lane = 0; 2877 auto *NewOp = State.get(User.getOperand(op), InputInstance); 2878 Cloned->setOperand(op, NewOp); 2879 } 2880 addNewMetadata(Cloned, Instr); 2881 2882 // Place the cloned scalar in the new loop. 2883 Builder.Insert(Cloned); 2884 2885 // TODO: Set result for VPValue of VPReciplicateRecipe. This requires 2886 // representing scalar values in VPTransformState. Add the cloned scalar to 2887 // the scalar map entry. 2888 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2889 2890 // If we just cloned a new assumption, add it the assumption cache. 2891 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2892 if (II->getIntrinsicID() == Intrinsic::assume) 2893 AC->registerAssumption(II); 2894 2895 // End if-block. 2896 if (IfPredicateInstr) 2897 PredicatedInstructions.push_back(Cloned); 2898 } 2899 2900 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2901 Value *End, Value *Step, 2902 Instruction *DL) { 2903 BasicBlock *Header = L->getHeader(); 2904 BasicBlock *Latch = L->getLoopLatch(); 2905 // As we're just creating this loop, it's possible no latch exists 2906 // yet. If so, use the header as this will be a single block loop. 2907 if (!Latch) 2908 Latch = Header; 2909 2910 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2911 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2912 setDebugLocFromInst(Builder, OldInst); 2913 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2914 2915 Builder.SetInsertPoint(Latch->getTerminator()); 2916 setDebugLocFromInst(Builder, OldInst); 2917 2918 // Create i+1 and fill the PHINode. 2919 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2920 Induction->addIncoming(Start, L->getLoopPreheader()); 2921 Induction->addIncoming(Next, Latch); 2922 // Create the compare. 2923 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2924 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 2925 2926 // Now we have two terminators. Remove the old one from the block. 2927 Latch->getTerminator()->eraseFromParent(); 2928 2929 return Induction; 2930 } 2931 2932 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2933 if (TripCount) 2934 return TripCount; 2935 2936 assert(L && "Create Trip Count for null loop."); 2937 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2938 // Find the loop boundaries. 2939 ScalarEvolution *SE = PSE.getSE(); 2940 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2941 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2942 "Invalid loop count"); 2943 2944 Type *IdxTy = Legal->getWidestInductionType(); 2945 assert(IdxTy && "No type for induction"); 2946 2947 // The exit count might have the type of i64 while the phi is i32. This can 2948 // happen if we have an induction variable that is sign extended before the 2949 // compare. 
The only way that we get a backedge taken count is that the 2950 // induction variable was signed and as such will not overflow. In such a case 2951 // truncation is legal. 2952 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2953 IdxTy->getPrimitiveSizeInBits()) 2954 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2955 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2956 2957 // Get the total trip count from the count by adding 1. 2958 const SCEV *ExitCount = SE->getAddExpr( 2959 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2960 2961 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2962 2963 // Expand the trip count and place the new instructions in the preheader. 2964 // Notice that the pre-header does not change, only the loop body. 2965 SCEVExpander Exp(*SE, DL, "induction"); 2966 2967 // Count holds the overall loop count (N). 2968 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2969 L->getLoopPreheader()->getTerminator()); 2970 2971 if (TripCount->getType()->isPointerTy()) 2972 TripCount = 2973 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2974 L->getLoopPreheader()->getTerminator()); 2975 2976 return TripCount; 2977 } 2978 2979 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2980 if (VectorTripCount) 2981 return VectorTripCount; 2982 2983 Value *TC = getOrCreateTripCount(L); 2984 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2985 2986 Type *Ty = TC->getType(); 2987 // This is where we can make the step a runtime constant. 2988 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 2989 2990 // If the tail is to be folded by masking, round the number of iterations N 2991 // up to a multiple of Step instead of rounding down. This is done by first 2992 // adding Step-1 and then rounding down. Note that it's ok if this addition 2993 // overflows: the vector induction variable will eventually wrap to zero given 2994 // that it starts at zero and its Step is a power of two; the loop will then 2995 // exit, with the last early-exit vector comparison also producing all-true. 2996 if (Cost->foldTailByMasking()) { 2997 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2998 "VF*UF must be a power of 2 when folding tail by masking"); 2999 assert(!VF.isScalable() && 3000 "Tail folding not yet supported for scalable vectors"); 3001 TC = Builder.CreateAdd( 3002 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3003 } 3004 3005 // Now we need to generate the expression for the part of the loop that the 3006 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3007 // iterations are not required for correctness, or N - Step, otherwise. Step 3008 // is equal to the vectorization factor (number of SIMD elements) times the 3009 // unroll factor (number of SIMD instructions). 3010 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3011 3012 // There are two cases where we need to ensure (at least) the last iteration 3013 // runs in the scalar remainder loop. Thus, if the step evenly divides 3014 // the trip count, we set the remainder to be equal to the step. If the step 3015 // does not evenly divide the trip count, no adjustment is necessary since 3016 // there will already be scalar iterations. Note that the minimum iterations 3017 // check ensures that N >= Step. 
The cases are: 3018 // 1) If there is a non-reversed interleaved group that may speculatively 3019 // access memory out-of-bounds. 3020 // 2) If any instruction may follow a conditionally taken exit. That is, if 3021 // the loop contains multiple exiting blocks, or a single exiting block 3022 // which is not the latch. 3023 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3024 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3025 R = Builder.CreateSelect(IsZero, Step, R); 3026 } 3027 3028 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3029 3030 return VectorTripCount; 3031 } 3032 3033 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3034 const DataLayout &DL) { 3035 // Verify that V is a vector type with same number of elements as DstVTy. 3036 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3037 unsigned VF = DstFVTy->getNumElements(); 3038 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3039 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3040 Type *SrcElemTy = SrcVecTy->getElementType(); 3041 Type *DstElemTy = DstFVTy->getElementType(); 3042 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3043 "Vector elements must have same size"); 3044 3045 // Do a direct cast if element types are castable. 3046 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3047 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3048 } 3049 // V cannot be directly casted to desired vector type. 3050 // May happen when V is a floating point vector but DstVTy is a vector of 3051 // pointers or vice-versa. Handle this using a two-step bitcast using an 3052 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3053 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3054 "Only one type should be a pointer type"); 3055 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3056 "Only one type should be a floating point type"); 3057 Type *IntTy = 3058 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3059 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3060 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3061 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3062 } 3063 3064 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3065 BasicBlock *Bypass) { 3066 Value *Count = getOrCreateTripCount(L); 3067 // Reuse existing vector loop preheader for TC checks. 3068 // Note that new preheader block is generated for vector loop. 3069 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3070 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3071 3072 // Generate code to check if the loop's trip count is less than VF * UF, or 3073 // equal to it in case a scalar epilogue is required; this implies that the 3074 // vector trip count is zero. This check also covers the case where adding one 3075 // to the backedge-taken count overflowed leading to an incorrect trip count 3076 // of zero. In this case we will also jump to the scalar loop. 3077 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3078 : ICmpInst::ICMP_ULT; 3079 3080 // If tail is to be folded, vector loop takes care of all iterations. 
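// When the tail is not folded, the emitted guard is roughly (illustrative;
// the predicate is ULE when a scalar epilogue is required, ULT otherwise):
//   %min.iters.check = icmp ult i64 %trip.count, VF * UF
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph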
3081 Value *CheckMinIters = Builder.getFalse(); 3082 if (!Cost->foldTailByMasking()) { 3083 Value *Step = 3084 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3085 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3086 } 3087 // Create new preheader for vector loop. 3088 LoopVectorPreHeader = 3089 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3090 "vector.ph"); 3091 3092 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3093 DT->getNode(Bypass)->getIDom()) && 3094 "TC check is expected to dominate Bypass"); 3095 3096 // Update dominator for Bypass & LoopExit. 3097 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3098 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3099 3100 ReplaceInstWithInst( 3101 TCCheckBlock->getTerminator(), 3102 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3103 LoopBypassBlocks.push_back(TCCheckBlock); 3104 } 3105 3106 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3107 // Reuse existing vector loop preheader for SCEV checks. 3108 // Note that new preheader block is generated for vector loop. 3109 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 3110 3111 // Generate the code to check that the SCEV assumptions that we made. 3112 // We want the new basic block to start at the first instruction in a 3113 // sequence of instructions that form a check. 3114 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 3115 "scev.check"); 3116 Value *SCEVCheck = Exp.expandCodeForPredicate( 3117 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 3118 3119 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 3120 if (C->isZero()) 3121 return; 3122 3123 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3124 (OptForSizeBasedOnProfile && 3125 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3126 "Cannot SCEV check stride or overflow when optimizing for size"); 3127 3128 SCEVCheckBlock->setName("vector.scevcheck"); 3129 // Create new preheader for vector loop. 3130 LoopVectorPreHeader = 3131 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 3132 nullptr, "vector.ph"); 3133 3134 // Update dominator only if this is first RT check. 3135 if (LoopBypassBlocks.empty()) { 3136 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3137 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3138 } 3139 3140 ReplaceInstWithInst( 3141 SCEVCheckBlock->getTerminator(), 3142 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 3143 LoopBypassBlocks.push_back(SCEVCheckBlock); 3144 AddedSafetyChecks = true; 3145 } 3146 3147 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 3148 // VPlan-native path does not do any analysis for runtime checks currently. 3149 if (EnableVPlanNativePath) 3150 return; 3151 3152 // Reuse existing vector loop preheader for runtime memory checks. 3153 // Note that new preheader block is generated for vector loop. 3154 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 3155 3156 // Generate the code that checks in runtime if arrays overlap. We put the 3157 // checks into a separate block to make the more common case of few elements 3158 // faster. 
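// As an illustrative sketch (the real checks come from addRuntimeChecks and
// RtPtrChecking below): for a loop writing %a and reading %b, this block
// conceptually computes
//   %overlap = (%a.start < %b.end) & (%b.start < %a.end)
// and branches back to the scalar loop when the ranges may alias.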
3159 auto *LAI = Legal->getLAI(); 3160 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 3161 if (!RtPtrChecking.Need) 3162 return; 3163 3164 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3165 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3166 "Cannot emit memory checks when optimizing for size, unless forced " 3167 "to vectorize."); 3168 ORE->emit([&]() { 3169 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3170 L->getStartLoc(), L->getHeader()) 3171 << "Code-size may be reduced by not forcing " 3172 "vectorization, or by source-code modifications " 3173 "eliminating the need for runtime checks " 3174 "(e.g., adding 'restrict')."; 3175 }); 3176 } 3177 3178 MemCheckBlock->setName("vector.memcheck"); 3179 // Create new preheader for vector loop. 3180 LoopVectorPreHeader = 3181 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 3182 "vector.ph"); 3183 3184 auto *CondBranch = cast<BranchInst>( 3185 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 3186 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 3187 LoopBypassBlocks.push_back(MemCheckBlock); 3188 AddedSafetyChecks = true; 3189 3190 // Update dominator only if this is first RT check. 3191 if (LoopBypassBlocks.empty()) { 3192 DT->changeImmediateDominator(Bypass, MemCheckBlock); 3193 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 3194 } 3195 3196 Instruction *FirstCheckInst; 3197 Instruction *MemRuntimeCheck; 3198 std::tie(FirstCheckInst, MemRuntimeCheck) = 3199 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 3200 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 3201 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 3202 "claimed checks are required"); 3203 CondBranch->setCondition(MemRuntimeCheck); 3204 3205 // We currently don't use LoopVersioning for the actual loop cloning but we 3206 // still use it to add the noalias metadata. 3207 LVer = std::make_unique<LoopVersioning>( 3208 *Legal->getLAI(), 3209 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3210 DT, PSE.getSE()); 3211 LVer->prepareNoAliasMetadata(); 3212 } 3213 3214 Value *InnerLoopVectorizer::emitTransformedIndex( 3215 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3216 const InductionDescriptor &ID) const { 3217 3218 SCEVExpander Exp(*SE, DL, "induction"); 3219 auto Step = ID.getStep(); 3220 auto StartValue = ID.getStartValue(); 3221 assert(Index->getType() == Step->getType() && 3222 "Index type does not match StepValue type"); 3223 3224 // Note: the IR at this point is broken. We cannot use SE to create any new 3225 // SCEV and then expand it, hoping that SCEV's simplification will give us 3226 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3227 // lead to various SCEV crashes. So all we can do is to use builder and rely 3228 // on InstCombine for future simplifications. Here we handle some trivial 3229 // cases only. 
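// E.g. (illustrative): for an integer induction with start %start and a step
// of 1, the helpers below fold 'Index * 1' to 'Index' and '0 + X' to 'X', so
// the transformed index becomes a single add of %start and Index instead of a
// chain of redundant multiplies and adds.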
3230 auto CreateAdd = [&B](Value *X, Value *Y) { 3231 assert(X->getType() == Y->getType() && "Types don't match!"); 3232 if (auto *CX = dyn_cast<ConstantInt>(X)) 3233 if (CX->isZero()) 3234 return Y; 3235 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3236 if (CY->isZero()) 3237 return X; 3238 return B.CreateAdd(X, Y); 3239 }; 3240 3241 auto CreateMul = [&B](Value *X, Value *Y) { 3242 assert(X->getType() == Y->getType() && "Types don't match!"); 3243 if (auto *CX = dyn_cast<ConstantInt>(X)) 3244 if (CX->isOne()) 3245 return Y; 3246 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3247 if (CY->isOne()) 3248 return X; 3249 return B.CreateMul(X, Y); 3250 }; 3251 3252 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3253 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3254 // the DomTree is not kept up-to-date for additional blocks generated in the 3255 // vector loop. By using the header as insertion point, we guarantee that the 3256 // expanded instructions dominate all their uses. 3257 auto GetInsertPoint = [this, &B]() { 3258 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3259 if (InsertBB != LoopVectorBody && 3260 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3261 return LoopVectorBody->getTerminator(); 3262 return &*B.GetInsertPoint(); 3263 }; 3264 switch (ID.getKind()) { 3265 case InductionDescriptor::IK_IntInduction: { 3266 assert(Index->getType() == StartValue->getType() && 3267 "Index type does not match StartValue type"); 3268 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3269 return B.CreateSub(StartValue, Index); 3270 auto *Offset = CreateMul( 3271 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3272 return CreateAdd(StartValue, Offset); 3273 } 3274 case InductionDescriptor::IK_PtrInduction: { 3275 assert(isa<SCEVConstant>(Step) && 3276 "Expected constant step for pointer induction"); 3277 return B.CreateGEP( 3278 StartValue->getType()->getPointerElementType(), StartValue, 3279 CreateMul(Index, 3280 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3281 } 3282 case InductionDescriptor::IK_FpInduction: { 3283 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3284 auto InductionBinOp = ID.getInductionBinOp(); 3285 assert(InductionBinOp && 3286 (InductionBinOp->getOpcode() == Instruction::FAdd || 3287 InductionBinOp->getOpcode() == Instruction::FSub) && 3288 "Original bin op should be defined for FP induction"); 3289 3290 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3291 3292 // Floating point operations had to be 'fast' to enable the induction. 3293 FastMathFlags Flags; 3294 Flags.setFast(); 3295 3296 Value *MulExp = B.CreateFMul(StepValue, Index); 3297 if (isa<Instruction>(MulExp)) 3298 // We have to check, the MulExp may be a constant. 
3299 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3300 3301 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3302 "induction"); 3303 if (isa<Instruction>(BOp)) 3304 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3305 3306 return BOp; 3307 } 3308 case InductionDescriptor::IK_NoInduction: 3309 return nullptr; 3310 } 3311 llvm_unreachable("invalid enum"); 3312 } 3313 3314 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3315 LoopScalarBody = OrigLoop->getHeader(); 3316 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3317 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3318 assert(LoopExitBlock && "Must have an exit block"); 3319 assert(LoopVectorPreHeader && "Invalid loop structure"); 3320 3321 LoopMiddleBlock = 3322 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3323 LI, nullptr, Twine(Prefix) + "middle.block"); 3324 LoopScalarPreHeader = 3325 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3326 nullptr, Twine(Prefix) + "scalar.ph"); 3327 3328 // Set up branch from middle block to the exit and scalar preheader blocks. 3329 // completeLoopSkeleton will update the condition to use an iteration check, 3330 // if required to decide whether to execute the remainder. 3331 BranchInst *BrInst = 3332 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3333 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3334 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3335 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3336 3337 // We intentionally don't let SplitBlock to update LoopInfo since 3338 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3339 // LoopVectorBody is explicitly added to the correct place few lines later. 3340 LoopVectorBody = 3341 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3342 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3343 3344 // Update dominator for loop exit. 3345 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3346 3347 // Create and register the new vector loop. 3348 Loop *Lp = LI->AllocateLoop(); 3349 Loop *ParentLoop = OrigLoop->getParentLoop(); 3350 3351 // Insert the new loop into the loop nest and register the new basic blocks 3352 // before calling any utilities such as SCEV that require valid LoopInfo. 3353 if (ParentLoop) { 3354 ParentLoop->addChildLoop(Lp); 3355 } else { 3356 LI->addTopLevelLoop(Lp); 3357 } 3358 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3359 return Lp; 3360 } 3361 3362 void InnerLoopVectorizer::createInductionResumeValues( 3363 Loop *L, Value *VectorTripCount, 3364 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3365 assert(VectorTripCount && L && "Expected valid arguments"); 3366 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3367 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3368 "Inconsistent information about additional bypass."); 3369 // We are going to resume the execution of the scalar loop. 3370 // Go over all of the induction variables that we found and fix the 3371 // PHIs that are left in the scalar version of the loop. 3372 // The starting values of PHI nodes depend on the counter of the last 3373 // iteration in the vectorized loop. 3374 // If we come from a bypass edge then we need to start from the original 3375 // start value. 
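// For the primary induction the resume phi looks roughly like (illustrative;
// the incoming blocks depend on which bypass checks were emitted):
//   %bc.resume.val = phi i64 [ %n.vec, %middle.block ],
//                            [ %start, %vector.memcheck ], ...
// so the scalar remainder resumes where the vector loop stopped, or at the
// original start value when a bypass edge was taken.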
3376 for (auto &InductionEntry : Legal->getInductionVars()) { 3377 PHINode *OrigPhi = InductionEntry.first; 3378 InductionDescriptor II = InductionEntry.second; 3379 3380 // Create phi nodes to merge from the backedge-taken check block. 3381 PHINode *BCResumeVal = 3382 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3383 LoopScalarPreHeader->getTerminator()); 3384 // Copy original phi DL over to the new one. 3385 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3386 Value *&EndValue = IVEndValues[OrigPhi]; 3387 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3388 if (OrigPhi == OldInduction) { 3389 // We know what the end value is. 3390 EndValue = VectorTripCount; 3391 } else { 3392 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3393 Type *StepType = II.getStep()->getType(); 3394 Instruction::CastOps CastOp = 3395 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3396 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3397 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3398 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3399 EndValue->setName("ind.end"); 3400 3401 // Compute the end value for the additional bypass (if applicable). 3402 if (AdditionalBypass.first) { 3403 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3404 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3405 StepType, true); 3406 CRD = 3407 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3408 EndValueFromAdditionalBypass = 3409 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3410 EndValueFromAdditionalBypass->setName("ind.end"); 3411 } 3412 } 3413 // The new PHI merges the original incoming value, in case of a bypass, 3414 // or the value at the end of the vectorized loop. 3415 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3416 3417 // Fix the scalar body counter (PHI node). 3418 // The old induction's phi node in the scalar body needs the truncated 3419 // value. 3420 for (BasicBlock *BB : LoopBypassBlocks) 3421 BCResumeVal->addIncoming(II.getStartValue(), BB); 3422 3423 if (AdditionalBypass.first) 3424 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3425 EndValueFromAdditionalBypass); 3426 3427 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3428 } 3429 } 3430 3431 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3432 MDNode *OrigLoopID) { 3433 assert(L && "Expected valid loop."); 3434 3435 // The trip counts should be cached by now. 3436 Value *Count = getOrCreateTripCount(L); 3437 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3438 3439 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3440 3441 // Add a check in the middle block to see if we have completed 3442 // all of the iterations in the first vector loop. 3443 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3444 // If tail is to be folded, we know we don't need to run the remainder. 3445 if (!Cost->foldTailByMasking()) { 3446 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3447 Count, VectorTripCount, "cmp.n", 3448 LoopMiddleBlock->getTerminator()); 3449 3450 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3451 // of the corresponding compare because they may have ended up with 3452 // different line numbers and we want to avoid awkward line stepping while 3453 // debugging. Eg. if the compare has got a line number inside the loop. 
3454 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3455 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3456 } 3457 3458 // Get ready to start creating new instructions into the vectorized body. 3459 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3460 "Inconsistent vector loop preheader"); 3461 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3462 3463 Optional<MDNode *> VectorizedLoopID = 3464 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3465 LLVMLoopVectorizeFollowupVectorized}); 3466 if (VectorizedLoopID.hasValue()) { 3467 L->setLoopID(VectorizedLoopID.getValue()); 3468 3469 // Do not setAlreadyVectorized if loop attributes have been defined 3470 // explicitly. 3471 return LoopVectorPreHeader; 3472 } 3473 3474 // Keep all loop hints from the original loop on the vector loop (we'll 3475 // replace the vectorizer-specific hints below). 3476 if (MDNode *LID = OrigLoop->getLoopID()) 3477 L->setLoopID(LID); 3478 3479 LoopVectorizeHints Hints(L, true, *ORE); 3480 Hints.setAlreadyVectorized(); 3481 3482 #ifdef EXPENSIVE_CHECKS 3483 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3484 LI->verify(*DT); 3485 #endif 3486 3487 return LoopVectorPreHeader; 3488 } 3489 3490 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3491 /* 3492 In this function we generate a new loop. The new loop will contain 3493 the vectorized instructions while the old loop will continue to run the 3494 scalar remainder. 3495 3496 [ ] <-- loop iteration number check. 3497 / | 3498 / v 3499 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3500 | / | 3501 | / v 3502 || [ ] <-- vector pre header. 3503 |/ | 3504 | v 3505 | [ ] \ 3506 | [ ]_| <-- vector loop. 3507 | | 3508 | v 3509 | -[ ] <--- middle-block. 3510 | / | 3511 | / v 3512 -|- >[ ] <--- new preheader. 3513 | | 3514 | v 3515 | [ ] \ 3516 | [ ]_| <-- old scalar loop to handle remainder. 3517 \ | 3518 \ v 3519 >[ ] <-- exit block. 3520 ... 3521 */ 3522 3523 // Get the metadata of the original loop before it gets modified. 3524 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3525 3526 // Create an empty vector loop, and prepare basic blocks for the runtime 3527 // checks. 3528 Loop *Lp = createVectorLoopSkeleton(""); 3529 3530 // Now, compare the new count to zero. If it is zero skip the vector loop and 3531 // jump to the scalar loop. This check also covers the case where the 3532 // backedge-taken count is uint##_max: adding one to it will overflow leading 3533 // to an incorrect trip count of zero. In this (rare) case we will also jump 3534 // to the scalar loop. 3535 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3536 3537 // Generate the code to check any assumptions that we've made for SCEV 3538 // expressions. 3539 emitSCEVChecks(Lp, LoopScalarPreHeader); 3540 3541 // Generate the code that checks in runtime if arrays overlap. We put the 3542 // checks into a separate block to make the more common case of few elements 3543 // faster. 3544 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3545 3546 // Some loops have a single integer induction variable, while other loops 3547 // don't. One example is c++ iterators that often have multiple pointer 3548 // induction variables. In the code below we also support a case where we 3549 // don't have a single induction variable. 3550 // 3551 // We try to obtain an induction variable from the original loop as hard 3552 // as possible. 
However if we don't find one that: 3553 // - is an integer 3554 // - counts from zero, stepping by one 3555 // - is the size of the widest induction variable type 3556 // then we create a new one. 3557 OldInduction = Legal->getPrimaryInduction(); 3558 Type *IdxTy = Legal->getWidestInductionType(); 3559 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3560 // The loop step is equal to the vectorization factor (num of SIMD elements) 3561 // times the unroll factor (num of SIMD instructions). 3562 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3563 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3564 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3565 Induction = 3566 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3567 getDebugLocFromInstOrOperands(OldInduction)); 3568 3569 // Emit phis for the new starting index of the scalar loop. 3570 createInductionResumeValues(Lp, CountRoundDown); 3571 3572 return completeLoopSkeleton(Lp, OrigLoopID); 3573 } 3574 3575 // Fix up external users of the induction variable. At this point, we are 3576 // in LCSSA form, with all external PHIs that use the IV having one input value, 3577 // coming from the remainder loop. We need those PHIs to also have a correct 3578 // value for the IV when arriving directly from the middle block. 3579 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3580 const InductionDescriptor &II, 3581 Value *CountRoundDown, Value *EndValue, 3582 BasicBlock *MiddleBlock) { 3583 // There are two kinds of external IV usages - those that use the value 3584 // computed in the last iteration (the PHI) and those that use the penultimate 3585 // value (the value that feeds into the phi from the loop latch). 3586 // We allow both, but they, obviously, have different values. 3587 3588 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3589 3590 DenseMap<Value *, Value *> MissingVals; 3591 3592 // An external user of the last iteration's value should see the value that 3593 // the remainder loop uses to initialize its own IV. 3594 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3595 for (User *U : PostInc->users()) { 3596 Instruction *UI = cast<Instruction>(U); 3597 if (!OrigLoop->contains(UI)) { 3598 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3599 MissingVals[UI] = EndValue; 3600 } 3601 } 3602 3603 // An external user of the penultimate value need to see EndValue - Step. 3604 // The simplest way to get this is to recompute it from the constituent SCEVs, 3605 // that is Start + (Step * (CRD - 1)). 3606 for (User *U : OrigPhi->users()) { 3607 auto *UI = cast<Instruction>(U); 3608 if (!OrigLoop->contains(UI)) { 3609 const DataLayout &DL = 3610 OrigLoop->getHeader()->getModule()->getDataLayout(); 3611 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3612 3613 IRBuilder<> B(MiddleBlock->getTerminator()); 3614 Value *CountMinusOne = B.CreateSub( 3615 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3616 Value *CMO = 3617 !II.getStep()->getType()->isIntegerTy() 3618 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3619 II.getStep()->getType()) 3620 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3621 CMO->setName("cast.cmo"); 3622 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3623 Escape->setName("ind.escape"); 3624 MissingVals[UI] = Escape; 3625 } 3626 } 3627 3628 for (auto &I : MissingVals) { 3629 PHINode *PHI = cast<PHINode>(I.first); 3630 // One corner case we have to handle is two IVs "chasing" each-other, 3631 // that is %IV2 = phi [...], [ %IV1, %latch ] 3632 // In this case, if IV1 has an external use, we need to avoid adding both 3633 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3634 // don't already have an incoming value for the middle block. 3635 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3636 PHI->addIncoming(I.second, MiddleBlock); 3637 } 3638 } 3639 3640 namespace { 3641 3642 struct CSEDenseMapInfo { 3643 static bool canHandle(const Instruction *I) { 3644 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3645 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3646 } 3647 3648 static inline Instruction *getEmptyKey() { 3649 return DenseMapInfo<Instruction *>::getEmptyKey(); 3650 } 3651 3652 static inline Instruction *getTombstoneKey() { 3653 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3654 } 3655 3656 static unsigned getHashValue(const Instruction *I) { 3657 assert(canHandle(I) && "Unknown instruction!"); 3658 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3659 I->value_op_end())); 3660 } 3661 3662 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3663 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3664 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3665 return LHS == RHS; 3666 return LHS->isIdenticalTo(RHS); 3667 } 3668 }; 3669 3670 } // end anonymous namespace 3671 3672 ///Perform cse of induction variable instructions. 3673 static void cse(BasicBlock *BB) { 3674 // Perform simple cse. 3675 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3676 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3677 Instruction *In = &*I++; 3678 3679 if (!CSEDenseMapInfo::canHandle(In)) 3680 continue; 3681 3682 // Check if we can replace this instruction with any of the 3683 // visited instructions. 3684 if (Instruction *V = CSEMap.lookup(In)) { 3685 In->replaceAllUsesWith(V); 3686 In->eraseFromParent(); 3687 continue; 3688 } 3689 3690 CSEMap[In] = In; 3691 } 3692 } 3693 3694 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3695 ElementCount VF, 3696 bool &NeedToScalarize) { 3697 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3698 Function *F = CI->getCalledFunction(); 3699 Type *ScalarRetTy = CI->getType(); 3700 SmallVector<Type *, 4> Tys, ScalarTys; 3701 for (auto &ArgOp : CI->arg_operands()) 3702 ScalarTys.push_back(ArgOp->getType()); 3703 3704 // Estimate cost of scalarized vector call. The source operands are assumed 3705 // to be vectors, so we need to extract individual elements from there, 3706 // execute VF scalar calls, and then gather the result into the vector return 3707 // value. 3708 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3709 TTI::TCK_RecipThroughput); 3710 if (VF.isScalar()) 3711 return ScalarCallCost; 3712 3713 // Compute corresponding vector type for return value and arguments. 
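// The vector types are needed to query the cost of a possible vector variant.
// Illustrative comparison (numbers made up, not from any real cost table):
// with VF = 4, a scalar call cost of 10 and a scalarization overhead of 8, the
// scalarized estimate below is 4 * 10 + 8 = 48; if the target provides a
// vector variant costing 20, NeedToScalarize is cleared and 20 is returned.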
3714 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3715 for (Type *ScalarTy : ScalarTys) 3716 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3717 3718 // Compute costs of unpacking argument values for the scalar calls and 3719 // packing the return values to a vector. 3720 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3721 3722 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3723 3724 // If we can't emit a vector call for this function, then the currently found 3725 // cost is the cost we need to return. 3726 NeedToScalarize = true; 3727 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3728 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3729 3730 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3731 return Cost; 3732 3733 // If the corresponding vector cost is cheaper, return its cost. 3734 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3735 TTI::TCK_RecipThroughput); 3736 if (VectorCallCost < Cost) { 3737 NeedToScalarize = false; 3738 return VectorCallCost; 3739 } 3740 return Cost; 3741 } 3742 3743 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3744 ElementCount VF) { 3745 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3746 assert(ID && "Expected intrinsic call!"); 3747 3748 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3749 return TTI.getIntrinsicInstrCost(CostAttrs, 3750 TargetTransformInfo::TCK_RecipThroughput); 3751 } 3752 3753 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3754 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3755 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3756 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3757 } 3758 3759 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3760 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3761 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3762 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3763 } 3764 3765 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3766 // For every instruction `I` in MinBWs, truncate the operands, create a 3767 // truncated version of `I` and reextend its result. InstCombine runs 3768 // later and will remove any ext/trunc pairs. 3769 SmallPtrSet<Value *, 4> Erased; 3770 for (const auto &KV : Cost->getMinimalBitwidths()) { 3771 // If the value wasn't vectorized, we must maintain the original scalar 3772 // type. The absence of the value from VectorLoopValueMap indicates that it 3773 // wasn't vectorized. 3774 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3775 continue; 3776 for (unsigned Part = 0; Part < UF; ++Part) { 3777 Value *I = getOrCreateVectorValue(KV.first, Part); 3778 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3779 continue; 3780 Type *OriginalTy = I->getType(); 3781 Type *ScalarTruncatedTy = 3782 IntegerType::get(OriginalTy->getContext(), KV.second); 3783 auto *TruncatedTy = FixedVectorType::get( 3784 ScalarTruncatedTy, 3785 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3786 if (TruncatedTy == OriginalTy) 3787 continue; 3788 3789 IRBuilder<> B(cast<Instruction>(I)); 3790 auto ShrinkOperand = [&](Value *V) -> Value * { 3791 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3792 if (ZI->getSrcTy() == TruncatedTy) 3793 return ZI->getOperand(0); 3794 return B.CreateZExtOrTrunc(V, TruncatedTy); 3795 }; 3796 3797 // The actual instruction modification depends on the instruction type, 3798 // unfortunately. 
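// For illustration (a sketch; MinBWs is computed by the cost model from
// demanded bits): an i32 binary op whose result is only ever used as i8 is
// rewritten, per unroll part, roughly as
//   %a.t = trunc <4 x i32> %a to <4 x i8>
//   %b.t = trunc <4 x i32> %b to <4 x i8>
//   %narrow = add <4 x i8> %a.t, %b.t
//   %widen = zext <4 x i8> %narrow to <4 x i32>
// and InstCombine later removes the redundant ext/trunc pairs.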
3799 Value *NewI = nullptr; 3800 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3801 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3802 ShrinkOperand(BO->getOperand(1))); 3803 3804 // Any wrapping introduced by shrinking this operation shouldn't be 3805 // considered undefined behavior. So, we can't unconditionally copy 3806 // arithmetic wrapping flags to NewI. 3807 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3808 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3809 NewI = 3810 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3811 ShrinkOperand(CI->getOperand(1))); 3812 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3813 NewI = B.CreateSelect(SI->getCondition(), 3814 ShrinkOperand(SI->getTrueValue()), 3815 ShrinkOperand(SI->getFalseValue())); 3816 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3817 switch (CI->getOpcode()) { 3818 default: 3819 llvm_unreachable("Unhandled cast!"); 3820 case Instruction::Trunc: 3821 NewI = ShrinkOperand(CI->getOperand(0)); 3822 break; 3823 case Instruction::SExt: 3824 NewI = B.CreateSExtOrTrunc( 3825 CI->getOperand(0), 3826 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3827 break; 3828 case Instruction::ZExt: 3829 NewI = B.CreateZExtOrTrunc( 3830 CI->getOperand(0), 3831 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3832 break; 3833 } 3834 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3835 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3836 ->getNumElements(); 3837 auto *O0 = B.CreateZExtOrTrunc( 3838 SI->getOperand(0), 3839 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3840 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3841 ->getNumElements(); 3842 auto *O1 = B.CreateZExtOrTrunc( 3843 SI->getOperand(1), 3844 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3845 3846 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3847 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3848 // Don't do anything with the operands, just extend the result. 3849 continue; 3850 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3851 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3852 ->getNumElements(); 3853 auto *O0 = B.CreateZExtOrTrunc( 3854 IE->getOperand(0), 3855 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3856 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3857 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3858 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3859 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3860 ->getNumElements(); 3861 auto *O0 = B.CreateZExtOrTrunc( 3862 EE->getOperand(0), 3863 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3864 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3865 } else { 3866 // If we don't know what to do, be conservative and don't do anything. 3867 continue; 3868 } 3869 3870 // Lastly, extend the result. 3871 NewI->takeName(cast<Instruction>(I)); 3872 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3873 I->replaceAllUsesWith(Res); 3874 cast<Instruction>(I)->eraseFromParent(); 3875 Erased.insert(I); 3876 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3877 } 3878 } 3879 3880 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3881 for (const auto &KV : Cost->getMinimalBitwidths()) { 3882 // If the value wasn't vectorized, we must maintain the original scalar 3883 // type. 
The absence of the value from VectorLoopValueMap indicates that it 3884 // wasn't vectorized. 3885 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3886 continue; 3887 for (unsigned Part = 0; Part < UF; ++Part) { 3888 Value *I = getOrCreateVectorValue(KV.first, Part); 3889 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3890 if (Inst && Inst->use_empty()) { 3891 Value *NewI = Inst->getOperand(0); 3892 Inst->eraseFromParent(); 3893 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3894 } 3895 } 3896 } 3897 } 3898 3899 void InnerLoopVectorizer::fixVectorizedLoop() { 3900 // Insert truncates and extends for any truncated instructions as hints to 3901 // InstCombine. 3902 if (VF.isVector()) 3903 truncateToMinimalBitwidths(); 3904 3905 // Fix widened non-induction PHIs by setting up the PHI operands. 3906 if (OrigPHIsToFix.size()) { 3907 assert(EnableVPlanNativePath && 3908 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3909 fixNonInductionPHIs(); 3910 } 3911 3912 // At this point every instruction in the original loop is widened to a 3913 // vector form. Now we need to fix the recurrences in the loop. These PHI 3914 // nodes are currently empty because we did not want to introduce cycles. 3915 // This is the second stage of vectorizing recurrences. 3916 fixCrossIterationPHIs(); 3917 3918 // Forget the original basic block. 3919 PSE.getSE()->forgetLoop(OrigLoop); 3920 3921 // Fix-up external users of the induction variables. 3922 for (auto &Entry : Legal->getInductionVars()) 3923 fixupIVUsers(Entry.first, Entry.second, 3924 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3925 IVEndValues[Entry.first], LoopMiddleBlock); 3926 3927 fixLCSSAPHIs(); 3928 for (Instruction *PI : PredicatedInstructions) 3929 sinkScalarOperands(&*PI); 3930 3931 // Remove redundant induction instructions. 3932 cse(LoopVectorBody); 3933 3934 // Set/update profile weights for the vector and remainder loops as original 3935 // loop iterations are now distributed among them. Note that original loop 3936 // represented by LoopScalarBody becomes remainder loop after vectorization. 3937 // 3938 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may 3939 // end up with a slightly roughened result, but that should be OK since the 3940 // profile is not inherently precise anyway. Note also that a possible bypass 3941 // of vector code caused by legality checks is ignored, optimistically 3942 // assigning all the weight to the vector loop. 3943 // 3944 // For scalable vectorization we can't know at compile time how many iterations 3945 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3946 // vscale of '1'. 3947 setProfileInfoAfterUnrolling( 3948 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 3949 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 3950 } 3951 3952 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3953 // In order to support recurrences we need to be able to vectorize Phi nodes. 3954 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3955 // stage #2: We now need to fix the recurrences by adding incoming edges to 3956 // the currently empty PHI nodes. At this point every instruction in the 3957 // original loop is widened to a vector form so we can use them to construct 3958 // the incoming edges. 3959 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3960 // Handle first-order recurrences and reductions that need to be fixed.
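// For example (illustrative): in
//   for (i = 1; i < n; ++i) { sum += a[i]; b[i] = a[i] - a[i-1]; }
// the phi feeding 'sum' is a reduction, while the phi carrying a[i-1] into
// the next iteration is a first-order recurrence; each is completed by the
// corresponding helper below.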
3961 if (Legal->isFirstOrderRecurrence(&Phi)) 3962 fixFirstOrderRecurrence(&Phi); 3963 else if (Legal->isReductionVariable(&Phi)) 3964 fixReduction(&Phi); 3965 } 3966 } 3967 3968 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3969 // This is the second phase of vectorizing first-order recurrences. An 3970 // overview of the transformation is described below. Suppose we have the 3971 // following loop. 3972 // 3973 // for (int i = 0; i < n; ++i) 3974 // b[i] = a[i] - a[i - 1]; 3975 // 3976 // There is a first-order recurrence on "a". For this loop, the shorthand 3977 // scalar IR looks like: 3978 // 3979 // scalar.ph: 3980 // s_init = a[-1] 3981 // br scalar.body 3982 // 3983 // scalar.body: 3984 // i = phi [0, scalar.ph], [i+1, scalar.body] 3985 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3986 // s2 = a[i] 3987 // b[i] = s2 - s1 3988 // br cond, scalar.body, ... 3989 // 3990 // In this example, s1 is a recurrence because it's value depends on the 3991 // previous iteration. In the first phase of vectorization, we created a 3992 // temporary value for s1. We now complete the vectorization and produce the 3993 // shorthand vector IR shown below (for VF = 4, UF = 1). 3994 // 3995 // vector.ph: 3996 // v_init = vector(..., ..., ..., a[-1]) 3997 // br vector.body 3998 // 3999 // vector.body 4000 // i = phi [0, vector.ph], [i+4, vector.body] 4001 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4002 // v2 = a[i, i+1, i+2, i+3]; 4003 // v3 = vector(v1(3), v2(0, 1, 2)) 4004 // b[i, i+1, i+2, i+3] = v2 - v3 4005 // br cond, vector.body, middle.block 4006 // 4007 // middle.block: 4008 // x = v2(3) 4009 // br scalar.ph 4010 // 4011 // scalar.ph: 4012 // s_init = phi [x, middle.block], [a[-1], otherwise] 4013 // br scalar.body 4014 // 4015 // After execution completes the vector loop, we extract the next value of 4016 // the recurrence (x) to use as the initial value in the scalar loop. 4017 4018 // Get the original loop preheader and single loop latch. 4019 auto *Preheader = OrigLoop->getLoopPreheader(); 4020 auto *Latch = OrigLoop->getLoopLatch(); 4021 4022 // Get the initial and previous values of the scalar recurrence. 4023 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4024 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4025 4026 // Create a vector from the initial value. 4027 auto *VectorInit = ScalarInit; 4028 if (VF.isVector()) { 4029 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4030 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4031 VectorInit = Builder.CreateInsertElement( 4032 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4033 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4034 } 4035 4036 // We constructed a temporary phi node in the first phase of vectorization. 4037 // This phi node will eventually be deleted. 4038 Builder.SetInsertPoint( 4039 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 4040 4041 // Create a phi node for the new recurrence. The current value will either be 4042 // the initial value inserted into a vector or loop-varying vector value. 4043 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4044 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4045 4046 // Get the vectorized previous value of the last part UF - 1. It appears last 4047 // among all unrolled iterations, due to the order of their construction. 
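// Illustrative sketch (not from the source): with VF = 4 and UF = 2 the widened
// previous value has two unroll parts,
//   prev.0 = a[i+0 .. i+3]   ; created first
//   prev.1 = a[i+4 .. i+7]   ; created last
// so part UF - 1 (prev.1 here) holds the lanes of the most recent iterations
// and is the value the recurrence must be continued from.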
4048 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 4049 4050 // Find and set the insertion point after the previous value if it is an 4051 // instruction. 4052 BasicBlock::iterator InsertPt; 4053 // Note that the previous value may have been constant-folded so it is not 4054 // guaranteed to be an instruction in the vector loop. 4055 // FIXME: Loop invariant values do not form recurrences. We should deal with 4056 // them earlier. 4057 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4058 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4059 else { 4060 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4061 if (isa<PHINode>(PreviousLastPart)) 4062 // If the previous value is a phi node, we should insert after all the phi 4063 // nodes in the block containing the PHI to avoid breaking basic block 4064 // verification. Note that the basic block may be different to 4065 // LoopVectorBody, in case we predicate the loop. 4066 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4067 else 4068 InsertPt = ++PreviousInst->getIterator(); 4069 } 4070 Builder.SetInsertPoint(&*InsertPt); 4071 4072 // We will construct a vector for the recurrence by combining the values for 4073 // the current and previous iterations. This is the required shuffle mask. 4074 assert(!VF.isScalable()); 4075 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4076 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4077 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4078 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4079 4080 // The vector from which to take the initial value for the current iteration 4081 // (actual or unrolled). Initially, this is the vector phi node. 4082 Value *Incoming = VecPhi; 4083 4084 // Shuffle the current and previous vector and update the vector parts. 4085 for (unsigned Part = 0; Part < UF; ++Part) { 4086 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 4087 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 4088 auto *Shuffle = 4089 VF.isVector() 4090 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4091 : Incoming; 4092 PhiPart->replaceAllUsesWith(Shuffle); 4093 cast<Instruction>(PhiPart)->eraseFromParent(); 4094 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 4095 Incoming = PreviousPart; 4096 } 4097 4098 // Fix the latch value of the new recurrence in the vector loop. 4099 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4100 4101 // Extract the last vector element in the middle block. This will be the 4102 // initial value for the recurrence when jumping to the scalar loop. 4103 auto *ExtractForScalar = Incoming; 4104 if (VF.isVector()) { 4105 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4106 ExtractForScalar = Builder.CreateExtractElement( 4107 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4108 "vector.recur.extract"); 4109 } 4110 // Extract the second last element in the middle block if the 4111 // Phi is used outside the loop. We need to extract the phi itself 4112 // and not the last element (the phi update in the current iteration). This 4113 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4114 // when the scalar loop is not run at all. 
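// Illustrative example (not from the source, VF = 4): if the final vector of
// previous values is <s(n-3), s(n-2), s(n-1), s(n)>, then
//   lane VF-1 = s(n)    ; extracted above as the start value for the scalar loop
//   lane VF-2 = s(n-1)  ; the value the phi itself held in the last vector
//                       ; iteration, which is what an LCSSA user outside the
//                       ; loop must see when the scalar remainder is skipped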
4115   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4116   if (VF.isVector())
4117     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4118         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4119         "vector.recur.extract.for.phi");
4120   // When the loop is unrolled without vectorizing, initialize
4121   // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
4122   // `Incoming`. This is analogous to the vectorized case above: extracting the
4123   // second last element when VF > 1.
4124   else if (UF > 1)
4125     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4126
4127   // Fix the initial value of the original recurrence in the scalar loop.
4128   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4129   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4130   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4131     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4132     Start->addIncoming(Incoming, BB);
4133   }
4134
4135   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4136   Phi->setName("scalar.recur");
4137
4138   // Finally, fix users of the recurrence outside the loop. The users will need
4139   // either the last value of the scalar recurrence or the last value of the
4140   // vector recurrence we extracted in the middle block. Since the loop is in
4141   // LCSSA form, we just need to find all the phi nodes for the original scalar
4142   // recurrence in the exit block, and then add an edge for the middle block.
4143   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4144     if (LCSSAPhi.getIncomingValue(0) == Phi) {
4145       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4146     }
4147   }
4148 }
4149
4150 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4151   Constant *Zero = Builder.getInt32(0);
4152
4153   // Get its reduction variable descriptor.
4154   assert(Legal->isReductionVariable(Phi) &&
4155          "Unable to find the reduction variable");
4156   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4157
4158   RecurKind RK = RdxDesc.getRecurrenceKind();
4159   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4160   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4161   setDebugLocFromInst(Builder, ReductionStartValue);
4162   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4163
4164   // We need to generate a reduction vector from the incoming scalar.
4165   // To do so, we need to generate the 'identity' vector and override
4166   // one of the elements with the incoming scalar reduction. We need
4167   // to do it in the vector-loop preheader.
4168   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4169
4170   // This is the vector-clone of the value that leaves the loop.
4171   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4172
4173   // Find the reduction identity value: zero for addition, or, and xor;
4174   // one for multiplication; -1 (all ones) for and.
4175   Value *Identity;
4176   Value *VectorStart;
4177   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4178     // MinMax reductions have the start value as their identity.
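// Illustrative examples (not from the source, VF = 4, start value %s):
//   add  : Identity = <0, 0, 0, 0>,     VectorStart = <%s, 0, 0, 0>
//   mul  : Identity = <1, 1, 1, 1>,     VectorStart = <%s, 1, 1, 1>
//   and  : Identity = <-1, -1, -1, -1>, VectorStart = <%s, -1, -1, -1>
//   smax : Identity = VectorStart = <%s, %s, %s, %s>; duplicating the start
//          value is harmless here because max(x, %s, %s, ...) == max(x, %s).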
4179 if (VF.isScalar() || IsInLoopReductionPhi) { 4180 VectorStart = Identity = ReductionStartValue; 4181 } else { 4182 VectorStart = Identity = 4183 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 4184 } 4185 } else { 4186 // Handle other reduction kinds: 4187 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 4188 RK, VecTy->getScalarType()); 4189 if (VF.isScalar() || IsInLoopReductionPhi) { 4190 Identity = Iden; 4191 // This vector is the Identity vector where the first element is the 4192 // incoming scalar reduction. 4193 VectorStart = ReductionStartValue; 4194 } else { 4195 Identity = ConstantVector::getSplat(VF, Iden); 4196 4197 // This vector is the Identity vector where the first element is the 4198 // incoming scalar reduction. 4199 VectorStart = 4200 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 4201 } 4202 } 4203 4204 // Wrap flags are in general invalid after vectorization, clear them. 4205 clearReductionWrapFlags(RdxDesc); 4206 4207 // Fix the vector-loop phi. 4208 4209 // Reductions do not have to start at zero. They can start with 4210 // any loop invariant values. 4211 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4212 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4213 4214 for (unsigned Part = 0; Part < UF; ++Part) { 4215 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 4216 Value *Val = getOrCreateVectorValue(LoopVal, Part); 4217 // Make sure to add the reduction start value only to the 4218 // first unroll part. 4219 Value *StartVal = (Part == 0) ? VectorStart : Identity; 4220 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 4221 cast<PHINode>(VecRdxPhi) 4222 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4223 } 4224 4225 // Before each round, move the insertion point right between 4226 // the PHIs and the values we are going to write. 4227 // This allows us to write both PHINodes and the extractelement 4228 // instructions. 4229 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4230 4231 setDebugLocFromInst(Builder, LoopExitInst); 4232 4233 // If tail is folded by masking, the vector value to leave the loop should be 4234 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4235 // instead of the former. For an inloop reduction the reduction will already 4236 // be predicated, and does not need to be handled here. 4237 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4238 for (unsigned Part = 0; Part < UF; ++Part) { 4239 Value *VecLoopExitInst = 4240 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4241 Value *Sel = nullptr; 4242 for (User *U : VecLoopExitInst->users()) { 4243 if (isa<SelectInst>(U)) { 4244 assert(!Sel && "Reduction exit feeding two selects"); 4245 Sel = U; 4246 } else 4247 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4248 } 4249 assert(Sel && "Reduction exit feeds no select"); 4250 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4251 4252 // If the target can create a predicated operator for the reduction at no 4253 // extra cost in the loop (for example a predicated vadd), it can be 4254 // cheaper for the select to remain in the loop than be sunk out of it, 4255 // and so use the select value for the phi instead of the old 4256 // LoopExitValue. 
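// Illustrative shorthand IR (not from the source) for a tail-folded add
// reduction after the rewiring below, i.e. when the target prefers to keep the
// predicated select in the loop:
//   %red        = phi [ %start, %vector.ph ], [ %red.select, %vector.body ]
//   %red.next   = add <4 x i32> %red, %val
//   %red.select = select <4 x i1> %mask, <4 x i32> %red.next, <4 x i32> %red
// Without that preference the phi keeps %red.next as its latch operand and only
// the value leaving the loop is replaced by the select, as done above.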
4257 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4258 if (PreferPredicatedReductionSelect || 4259 TTI->preferPredicatedReductionSelect( 4260 RdxDesc.getOpcode(), Phi->getType(), 4261 TargetTransformInfo::ReductionFlags())) { 4262 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4263 VecRdxPhi->setIncomingValueForBlock( 4264 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4265 } 4266 } 4267 } 4268 4269 // If the vector reduction can be performed in a smaller type, we truncate 4270 // then extend the loop exit value to enable InstCombine to evaluate the 4271 // entire expression in the smaller type. 4272 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4273 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4274 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4275 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4276 Builder.SetInsertPoint( 4277 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4278 VectorParts RdxParts(UF); 4279 for (unsigned Part = 0; Part < UF; ++Part) { 4280 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4281 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4282 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4283 : Builder.CreateZExt(Trunc, VecTy); 4284 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4285 UI != RdxParts[Part]->user_end();) 4286 if (*UI != Trunc) { 4287 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4288 RdxParts[Part] = Extnd; 4289 } else { 4290 ++UI; 4291 } 4292 } 4293 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4294 for (unsigned Part = 0; Part < UF; ++Part) { 4295 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4296 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4297 } 4298 } 4299 4300 // Reduce all of the unrolled parts into a single vector. 4301 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4302 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4303 4304 // The middle block terminator has already been assigned a DebugLoc here (the 4305 // OrigLoop's single latch terminator). We want the whole middle block to 4306 // appear to execute on this line because: (a) it is all compiler generated, 4307 // (b) these instructions are always executed after evaluating the latch 4308 // conditional branch, and (c) other passes may add new predecessors which 4309 // terminate on this line. This is the easiest way to ensure we don't 4310 // accidentally cause an extra step back into the loop while debugging. 4311 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4312 for (unsigned Part = 1; Part < UF; ++Part) { 4313 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4314 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4315 // Floating point operations had to be 'fast' to enable the reduction. 4316 ReducedPartRdx = addFastMathFlag( 4317 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4318 ReducedPartRdx, "bin.rdx"), 4319 RdxDesc.getFastMathFlags()); 4320 else 4321 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4322 } 4323 4324 // Create the reduction after the loop. Note that inloop reductions create the 4325 // target reduction in the loop using a Reduction recipe. 
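// Illustrative shorthand (not from the source, VF = 4, UF = 2, add reduction):
//   %bin.rdx = add <4 x i32> %rdx.part0, %rdx.part1   ; emitted by the loop above
//   %rdx     = horizontal reduce of %bin.rdx          ; emitted below, e.g. as an
//                                                     ; @llvm.vector.reduce.add call
// In-loop reductions skip the second step because a reduction recipe has
// already produced the scalar result inside the loop.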
4326 if (VF.isVector() && !IsInLoopReductionPhi) { 4327 ReducedPartRdx = 4328 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4329 // If the reduction can be performed in a smaller type, we need to extend 4330 // the reduction to the wider type before we branch to the original loop. 4331 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4332 ReducedPartRdx = 4333 RdxDesc.isSigned() 4334 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4335 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4336 } 4337 4338 // Create a phi node that merges control-flow from the backedge-taken check 4339 // block and the middle block. 4340 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4341 LoopScalarPreHeader->getTerminator()); 4342 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4343 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4344 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4345 4346 // Now, we need to fix the users of the reduction variable 4347 // inside and outside of the scalar remainder loop. 4348 // We know that the loop is in LCSSA form. We need to update the 4349 // PHI nodes in the exit blocks. 4350 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4351 // All PHINodes need to have a single entry edge, or two if 4352 // we already fixed them. 4353 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4354 4355 // We found a reduction value exit-PHI. Update it with the 4356 // incoming bypass edge. 4357 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4358 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4359 } // end of the LCSSA phi scan. 4360 4361 // Fix the scalar loop reduction variable with the incoming reduction sum 4362 // from the vector body and from the backedge value. 4363 int IncomingEdgeBlockIdx = 4364 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4365 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4366 // Pick the other block. 4367 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4368 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4369 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4370 } 4371 4372 void InnerLoopVectorizer::clearReductionWrapFlags( 4373 RecurrenceDescriptor &RdxDesc) { 4374 RecurKind RK = RdxDesc.getRecurrenceKind(); 4375 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4376 return; 4377 4378 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4379 assert(LoopExitInstr && "null loop exit instruction"); 4380 SmallVector<Instruction *, 8> Worklist; 4381 SmallPtrSet<Instruction *, 8> Visited; 4382 Worklist.push_back(LoopExitInstr); 4383 Visited.insert(LoopExitInstr); 4384 4385 while (!Worklist.empty()) { 4386 Instruction *Cur = Worklist.pop_back_val(); 4387 if (isa<OverflowingBinaryOperator>(Cur)) 4388 for (unsigned Part = 0; Part < UF; ++Part) { 4389 Value *V = getOrCreateVectorValue(Cur, Part); 4390 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4391 } 4392 4393 for (User *U : Cur->users()) { 4394 Instruction *UI = cast<Instruction>(U); 4395 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4396 Visited.insert(UI).second) 4397 Worklist.push_back(UI); 4398 } 4399 } 4400 } 4401 4402 void InnerLoopVectorizer::fixLCSSAPHIs() { 4403 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4404 if (LCSSAPhi.getNumIncomingValues() == 1) { 4405 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4406 // Non-instruction incoming values will have only one value. 
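// Illustrative example (not from the source): for an exit-block phi such as
//   %t.lcssa = phi i32 [ %t, %for.body ]
// the middle block must provide the value %t had in the last vector iteration,
// i.e. scalar lane VF-1 of unroll part UF-1; if %t is uniform after
// vectorization (or is not an instruction at all), lane 0 of part UF-1 is used.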
4407 unsigned LastLane = 0; 4408 if (isa<Instruction>(IncomingValue)) 4409 LastLane = Cost->isUniformAfterVectorization( 4410 cast<Instruction>(IncomingValue), VF) 4411 ? 0 4412 : VF.getKnownMinValue() - 1; 4413 assert((!VF.isScalable() || LastLane == 0) && 4414 "scalable vectors dont support non-uniform scalars yet"); 4415 // Can be a loop invariant incoming value or the last scalar value to be 4416 // extracted from the vectorized loop. 4417 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4418 Value *lastIncomingValue = 4419 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4420 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4421 } 4422 } 4423 } 4424 4425 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4426 // The basic block and loop containing the predicated instruction. 4427 auto *PredBB = PredInst->getParent(); 4428 auto *VectorLoop = LI->getLoopFor(PredBB); 4429 4430 // Initialize a worklist with the operands of the predicated instruction. 4431 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4432 4433 // Holds instructions that we need to analyze again. An instruction may be 4434 // reanalyzed if we don't yet know if we can sink it or not. 4435 SmallVector<Instruction *, 8> InstsToReanalyze; 4436 4437 // Returns true if a given use occurs in the predicated block. Phi nodes use 4438 // their operands in their corresponding predecessor blocks. 4439 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4440 auto *I = cast<Instruction>(U.getUser()); 4441 BasicBlock *BB = I->getParent(); 4442 if (auto *Phi = dyn_cast<PHINode>(I)) 4443 BB = Phi->getIncomingBlock( 4444 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4445 return BB == PredBB; 4446 }; 4447 4448 // Iteratively sink the scalarized operands of the predicated instruction 4449 // into the block we created for it. When an instruction is sunk, it's 4450 // operands are then added to the worklist. The algorithm ends after one pass 4451 // through the worklist doesn't sink a single instruction. 4452 bool Changed; 4453 do { 4454 // Add the instructions that need to be reanalyzed to the worklist, and 4455 // reset the changed indicator. 4456 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4457 InstsToReanalyze.clear(); 4458 Changed = false; 4459 4460 while (!Worklist.empty()) { 4461 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4462 4463 // We can't sink an instruction if it is a phi node, is already in the 4464 // predicated block, is not in the loop, or may have side effects. 4465 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4466 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4467 continue; 4468 4469 // It's legal to sink the instruction if all its uses occur in the 4470 // predicated block. Otherwise, there's nothing to do yet, and we may 4471 // need to reanalyze the instruction. 4472 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4473 InstsToReanalyze.push_back(I); 4474 continue; 4475 } 4476 4477 // Move the instruction to the beginning of the predicated block, and add 4478 // it's operands to the worklist. 4479 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4480 Worklist.insert(I->op_begin(), I->op_end()); 4481 4482 // The sinking may have enabled other instructions to be sunk, so we will 4483 // need to iterate. 
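// Illustrative example (not from the source): if a scalarized, predicated load's
// address is computed by
//   %idx = add i64 %i, 4
//   %gep = getelementptr i32, i32* %base, i64 %idx
// then sinking %gep into the predicated block is what first makes %idx eligible
// for sinking, which is why the surrounding do-while runs to a fixed point.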
4484 Changed = true; 4485 } 4486 } while (Changed); 4487 } 4488 4489 void InnerLoopVectorizer::fixNonInductionPHIs() { 4490 for (PHINode *OrigPhi : OrigPHIsToFix) { 4491 PHINode *NewPhi = 4492 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4493 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4494 4495 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4496 predecessors(OrigPhi->getParent())); 4497 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4498 predecessors(NewPhi->getParent())); 4499 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4500 "Scalar and Vector BB should have the same number of predecessors"); 4501 4502 // The insertion point in Builder may be invalidated by the time we get 4503 // here. Force the Builder insertion point to something valid so that we do 4504 // not run into issues during insertion point restore in 4505 // getOrCreateVectorValue calls below. 4506 Builder.SetInsertPoint(NewPhi); 4507 4508 // The predecessor order is preserved and we can rely on mapping between 4509 // scalar and vector block predecessors. 4510 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4511 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4512 4513 // When looking up the new scalar/vector values to fix up, use incoming 4514 // values from original phi. 4515 Value *ScIncV = 4516 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4517 4518 // Scalar incoming value may need a broadcast 4519 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4520 NewPhi->addIncoming(NewIncV, NewPredBB); 4521 } 4522 } 4523 } 4524 4525 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4526 VPUser &Operands, unsigned UF, 4527 ElementCount VF, bool IsPtrLoopInvariant, 4528 SmallBitVector &IsIndexLoopInvariant, 4529 VPTransformState &State) { 4530 // Construct a vector GEP by widening the operands of the scalar GEP as 4531 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4532 // results in a vector of pointers when at least one operand of the GEP 4533 // is vector-typed. Thus, to keep the representation compact, we only use 4534 // vector-typed operands for loop-varying values. 4535 4536 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4537 // If we are vectorizing, but the GEP has only loop-invariant operands, 4538 // the GEP we build (by only using vector-typed operands for 4539 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4540 // produce a vector of pointers, we need to either arbitrarily pick an 4541 // operand to broadcast, or broadcast a clone of the original GEP. 4542 // Here, we broadcast a clone of the original. 4543 // 4544 // TODO: If at some point we decide to scalarize instructions having 4545 // loop-invariant operands, this special case will no longer be 4546 // required. We would add the scalarization decision to 4547 // collectLoopScalars() and teach getVectorValue() to broadcast 4548 // the lane-zero scalar value. 4549 auto *Clone = Builder.Insert(GEP->clone()); 4550 for (unsigned Part = 0; Part < UF; ++Part) { 4551 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4552 State.set(VPDef, GEP, EntryPart, Part); 4553 addMetadata(EntryPart, GEP); 4554 } 4555 } else { 4556 // If the GEP has at least one loop-varying operand, we are sure to 4557 // produce a vector of pointers. But if we are only unrolling, we want 4558 // to produce a scalar GEP for each unroll part. 
Thus, the GEP we 4559 // produce with the code below will be scalar (if VF == 1) or vector 4560 // (otherwise). Note that for the unroll-only case, we still maintain 4561 // values in the vector mapping with initVector, as we do for other 4562 // instructions. 4563 for (unsigned Part = 0; Part < UF; ++Part) { 4564 // The pointer operand of the new GEP. If it's loop-invariant, we 4565 // won't broadcast it. 4566 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4567 : State.get(Operands.getOperand(0), Part); 4568 4569 // Collect all the indices for the new GEP. If any index is 4570 // loop-invariant, we won't broadcast it. 4571 SmallVector<Value *, 4> Indices; 4572 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4573 VPValue *Operand = Operands.getOperand(I); 4574 if (IsIndexLoopInvariant[I - 1]) 4575 Indices.push_back(State.get(Operand, {0, 0})); 4576 else 4577 Indices.push_back(State.get(Operand, Part)); 4578 } 4579 4580 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4581 // but it should be a vector, otherwise. 4582 auto *NewGEP = 4583 GEP->isInBounds() 4584 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4585 Indices) 4586 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4587 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4588 "NewGEP is not a pointer vector"); 4589 State.set(VPDef, GEP, NewGEP, Part); 4590 addMetadata(NewGEP, GEP); 4591 } 4592 } 4593 } 4594 4595 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4596 ElementCount VF) { 4597 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4598 PHINode *P = cast<PHINode>(PN); 4599 if (EnableVPlanNativePath) { 4600 // Currently we enter here in the VPlan-native path for non-induction 4601 // PHIs where all control flow is uniform. We simply widen these PHIs. 4602 // Create a vector phi with no operands - the vector phi operands will be 4603 // set at the end of vector code generation. 4604 Type *VecTy = 4605 (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF); 4606 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4607 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4608 OrigPHIsToFix.push_back(P); 4609 4610 return; 4611 } 4612 4613 assert(PN->getParent() == OrigLoop->getHeader() && 4614 "Non-header phis should have been handled elsewhere"); 4615 4616 // In order to support recurrences we need to be able to vectorize Phi nodes. 4617 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4618 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4619 // this value when we vectorize all of the instructions that use the PHI. 4620 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4621 for (unsigned Part = 0; Part < UF; ++Part) { 4622 // This is phase one of vectorizing PHIs. 4623 bool ScalarPHI = 4624 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4625 Type *VecTy = 4626 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4627 Value *EntryPart = PHINode::Create( 4628 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4629 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4630 } 4631 return; 4632 } 4633 4634 setDebugLocFromInst(Builder, P); 4635 4636 // This PHINode must be an induction variable. 4637 // Make sure that we know about it. 
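// Illustrative example (not from the source) of a pointer induction handled by
// the IK_PtrInduction case below:
//   for (int *p = a; p != a + n; ++p)
//     *p = 0;
// Integer and floating-point inductions are widened elsewhere, which is why
// those cases are unreachable here.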
4638 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4639 4640 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4641 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4642 4643 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4644 // which can be found from the original scalar operations. 4645 switch (II.getKind()) { 4646 case InductionDescriptor::IK_NoInduction: 4647 llvm_unreachable("Unknown induction"); 4648 case InductionDescriptor::IK_IntInduction: 4649 case InductionDescriptor::IK_FpInduction: 4650 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4651 case InductionDescriptor::IK_PtrInduction: { 4652 // Handle the pointer induction variable case. 4653 assert(P->getType()->isPointerTy() && "Unexpected type."); 4654 4655 if (Cost->isScalarAfterVectorization(P, VF)) { 4656 // This is the normalized GEP that starts counting at zero. 4657 Value *PtrInd = 4658 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4659 // Determine the number of scalars we need to generate for each unroll 4660 // iteration. If the instruction is uniform, we only need to generate the 4661 // first lane. Otherwise, we generate all VF values. 4662 unsigned Lanes = 4663 Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue(); 4664 for (unsigned Part = 0; Part < UF; ++Part) { 4665 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4666 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4667 Lane + Part * VF.getKnownMinValue()); 4668 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4669 Value *SclrGep = 4670 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4671 SclrGep->setName("next.gep"); 4672 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4673 } 4674 } 4675 return; 4676 } 4677 assert(isa<SCEVConstant>(II.getStep()) && 4678 "Induction step not a SCEV constant!"); 4679 Type *PhiType = II.getStep()->getType(); 4680 4681 // Build a pointer phi 4682 Value *ScalarStartValue = II.getStartValue(); 4683 Type *ScStValueType = ScalarStartValue->getType(); 4684 PHINode *NewPointerPhi = 4685 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4686 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4687 4688 // A pointer induction, performed by using a gep 4689 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4690 Instruction *InductionLoc = LoopLatch->getTerminator(); 4691 const SCEV *ScalarStep = II.getStep(); 4692 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4693 Value *ScalarStepValue = 4694 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4695 Value *InductionGEP = GetElementPtrInst::Create( 4696 ScStValueType->getPointerElementType(), NewPointerPhi, 4697 Builder.CreateMul( 4698 ScalarStepValue, 4699 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4700 "ptr.ind", InductionLoc); 4701 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4702 4703 // Create UF many actual address geps that use the pointer 4704 // phi as base and a vectorized version of the step value 4705 // (<step*0, ..., step*N>) as offset. 4706 for (unsigned Part = 0; Part < UF; ++Part) { 4707 SmallVector<Constant *, 8> Indices; 4708 // Create a vector of consecutive numbers from zero to VF. 
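// Illustrative example (not from the source): for VF = 4 and UF = 2 the
// constant offset vectors are
//   part 0: <0, 1, 2, 3>
//   part 1: <4, 5, 6, 7>
// and each is multiplied below by the splatted scalar step to form the
// per-part pointer offsets.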
4709 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4710 Indices.push_back( 4711 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4712 Constant *StartOffset = ConstantVector::get(Indices); 4713 4714 Value *GEP = Builder.CreateGEP( 4715 ScStValueType->getPointerElementType(), NewPointerPhi, 4716 Builder.CreateMul( 4717 StartOffset, 4718 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4719 "vector.gep")); 4720 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4721 } 4722 } 4723 } 4724 } 4725 4726 /// A helper function for checking whether an integer division-related 4727 /// instruction may divide by zero (in which case it must be predicated if 4728 /// executed conditionally in the scalar code). 4729 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4730 /// Non-zero divisors that are non compile-time constants will not be 4731 /// converted into multiplication, so we will still end up scalarizing 4732 /// the division, but can do so w/o predication. 4733 static bool mayDivideByZero(Instruction &I) { 4734 assert((I.getOpcode() == Instruction::UDiv || 4735 I.getOpcode() == Instruction::SDiv || 4736 I.getOpcode() == Instruction::URem || 4737 I.getOpcode() == Instruction::SRem) && 4738 "Unexpected instruction"); 4739 Value *Divisor = I.getOperand(1); 4740 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4741 return !CInt || CInt->isZero(); 4742 } 4743 4744 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4745 VPUser &User, 4746 VPTransformState &State) { 4747 switch (I.getOpcode()) { 4748 case Instruction::Call: 4749 case Instruction::Br: 4750 case Instruction::PHI: 4751 case Instruction::GetElementPtr: 4752 case Instruction::Select: 4753 llvm_unreachable("This instruction is handled by a different recipe."); 4754 case Instruction::UDiv: 4755 case Instruction::SDiv: 4756 case Instruction::SRem: 4757 case Instruction::URem: 4758 case Instruction::Add: 4759 case Instruction::FAdd: 4760 case Instruction::Sub: 4761 case Instruction::FSub: 4762 case Instruction::FNeg: 4763 case Instruction::Mul: 4764 case Instruction::FMul: 4765 case Instruction::FDiv: 4766 case Instruction::FRem: 4767 case Instruction::Shl: 4768 case Instruction::LShr: 4769 case Instruction::AShr: 4770 case Instruction::And: 4771 case Instruction::Or: 4772 case Instruction::Xor: { 4773 // Just widen unops and binops. 4774 setDebugLocFromInst(Builder, &I); 4775 4776 for (unsigned Part = 0; Part < UF; ++Part) { 4777 SmallVector<Value *, 2> Ops; 4778 for (VPValue *VPOp : User.operands()) 4779 Ops.push_back(State.get(VPOp, Part)); 4780 4781 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4782 4783 if (auto *VecOp = dyn_cast<Instruction>(V)) 4784 VecOp->copyIRFlags(&I); 4785 4786 // Use this vector value for all users of the original instruction. 4787 State.set(Def, &I, V, Part); 4788 addMetadata(V, &I); 4789 } 4790 4791 break; 4792 } 4793 case Instruction::ICmp: 4794 case Instruction::FCmp: { 4795 // Widen compares. Generate vector compares. 4796 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4797 auto *Cmp = cast<CmpInst>(&I); 4798 setDebugLocFromInst(Builder, Cmp); 4799 for (unsigned Part = 0; Part < UF; ++Part) { 4800 Value *A = State.get(User.getOperand(0), Part); 4801 Value *B = State.get(User.getOperand(1), Part); 4802 Value *C = nullptr; 4803 if (FCmp) { 4804 // Propagate fast math flags. 
4805 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4806 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4807 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4808 } else { 4809 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4810 } 4811 State.set(Def, &I, C, Part); 4812 addMetadata(C, &I); 4813 } 4814 4815 break; 4816 } 4817 4818 case Instruction::ZExt: 4819 case Instruction::SExt: 4820 case Instruction::FPToUI: 4821 case Instruction::FPToSI: 4822 case Instruction::FPExt: 4823 case Instruction::PtrToInt: 4824 case Instruction::IntToPtr: 4825 case Instruction::SIToFP: 4826 case Instruction::UIToFP: 4827 case Instruction::Trunc: 4828 case Instruction::FPTrunc: 4829 case Instruction::BitCast: { 4830 auto *CI = cast<CastInst>(&I); 4831 setDebugLocFromInst(Builder, CI); 4832 4833 /// Vectorize casts. 4834 Type *DestTy = 4835 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4836 4837 for (unsigned Part = 0; Part < UF; ++Part) { 4838 Value *A = State.get(User.getOperand(0), Part); 4839 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4840 State.set(Def, &I, Cast, Part); 4841 addMetadata(Cast, &I); 4842 } 4843 break; 4844 } 4845 default: 4846 // This instruction is not vectorized by simple widening. 4847 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4848 llvm_unreachable("Unhandled instruction!"); 4849 } // end of switch. 4850 } 4851 4852 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4853 VPUser &ArgOperands, 4854 VPTransformState &State) { 4855 assert(!isa<DbgInfoIntrinsic>(I) && 4856 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4857 setDebugLocFromInst(Builder, &I); 4858 4859 Module *M = I.getParent()->getParent()->getParent(); 4860 auto *CI = cast<CallInst>(&I); 4861 4862 SmallVector<Type *, 4> Tys; 4863 for (Value *ArgOperand : CI->arg_operands()) 4864 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4865 4866 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4867 4868 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4869 // version of the instruction. 4870 // Is it beneficial to perform intrinsic call compared to lib call? 4871 bool NeedToScalarize = false; 4872 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4873 bool UseVectorIntrinsic = 4874 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4875 assert((UseVectorIntrinsic || !NeedToScalarize) && 4876 "Instruction should be scalarized elsewhere."); 4877 4878 for (unsigned Part = 0; Part < UF; ++Part) { 4879 SmallVector<Value *, 4> Args; 4880 for (auto &I : enumerate(ArgOperands.operands())) { 4881 // Some intrinsics have a scalar argument - don't replace it with a 4882 // vector. 4883 Value *Arg; 4884 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4885 Arg = State.get(I.value(), Part); 4886 else 4887 Arg = State.get(I.value(), {0, 0}); 4888 Args.push_back(Arg); 4889 } 4890 4891 Function *VectorF; 4892 if (UseVectorIntrinsic) { 4893 // Use vector version of the intrinsic. 4894 Type *TysForDecl[] = {CI->getType()}; 4895 if (VF.isVector()) { 4896 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4897 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4898 } 4899 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4900 assert(VectorF && "Can't retrieve vector intrinsic."); 4901 } else { 4902 // Use vector version of the function call. 
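// Illustrative example (not from the source; the vector callee name is
// hypothetical): a scalar call
//   %r = call float @foo(float %x)
// that has a vector variant registered for VF = 4 (e.g. through the
// "vector-function-abi-variant" attribute) is widened into
//   %r = call <4 x float> @foo_v4(<4 x float> %x)
// where the callee is looked up in the VFDatabase using the VFShape built below.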
4903 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4904 #ifndef NDEBUG 4905 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4906 "Can't create vector function."); 4907 #endif 4908 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4909 } 4910 SmallVector<OperandBundleDef, 1> OpBundles; 4911 CI->getOperandBundlesAsDefs(OpBundles); 4912 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4913 4914 if (isa<FPMathOperator>(V)) 4915 V->copyFastMathFlags(CI); 4916 4917 State.set(Def, &I, V, Part); 4918 addMetadata(V, &I); 4919 } 4920 } 4921 4922 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4923 VPUser &Operands, 4924 bool InvariantCond, 4925 VPTransformState &State) { 4926 setDebugLocFromInst(Builder, &I); 4927 4928 // The condition can be loop invariant but still defined inside the 4929 // loop. This means that we can't just use the original 'cond' value. 4930 // We have to take the 'vectorized' value and pick the first lane. 4931 // Instcombine will make this a no-op. 4932 auto *InvarCond = 4933 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4934 4935 for (unsigned Part = 0; Part < UF; ++Part) { 4936 Value *Cond = 4937 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4938 Value *Op0 = State.get(Operands.getOperand(1), Part); 4939 Value *Op1 = State.get(Operands.getOperand(2), Part); 4940 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4941 State.set(VPDef, &I, Sel, Part); 4942 addMetadata(Sel, &I); 4943 } 4944 } 4945 4946 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4947 // We should not collect Scalars more than once per VF. Right now, this 4948 // function is called from collectUniformsAndScalars(), which already does 4949 // this check. Collecting Scalars for VF=1 does not make any sense. 4950 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4951 "This function should not be visited twice for the same VF"); 4952 4953 SmallSetVector<Instruction *, 8> Worklist; 4954 4955 // These sets are used to seed the analysis with pointers used by memory 4956 // accesses that will remain scalar. 4957 SmallSetVector<Instruction *, 8> ScalarPtrs; 4958 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4959 auto *Latch = TheLoop->getLoopLatch(); 4960 4961 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4962 // The pointer operands of loads and stores will be scalar as long as the 4963 // memory access is not a gather or scatter operation. The value operand of a 4964 // store will remain scalar if the store is scalarized. 4965 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4966 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4967 assert(WideningDecision != CM_Unknown && 4968 "Widening decision should be ready at this moment"); 4969 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4970 if (Ptr == Store->getValueOperand()) 4971 return WideningDecision == CM_Scalarize; 4972 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4973 "Ptr is neither a value or pointer operand"); 4974 return WideningDecision != CM_GatherScatter; 4975 }; 4976 4977 // A helper that returns true if the given value is a bitcast or 4978 // getelementptr instruction contained in the loop. 
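// Illustrative example (not from the source):
//   %gep = getelementptr inbounds i32, i32* %a, i64 %iv
//   %v   = load i32, i32* %gep
// If the load is widened to a single consecutive vector load, only lane 0 of
// %gep is demanded, so the use is scalar; if the load becomes a gather, a full
// vector of pointers is needed and %gep must not end up in ScalarPtrs.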
4979 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4980 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4981 isa<GetElementPtrInst>(V)) && 4982 !TheLoop->isLoopInvariant(V); 4983 }; 4984 4985 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4986 if (!isa<PHINode>(Ptr) || 4987 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4988 return false; 4989 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4990 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4991 return false; 4992 return isScalarUse(MemAccess, Ptr); 4993 }; 4994 4995 // A helper that evaluates a memory access's use of a pointer. If the 4996 // pointer is actually the pointer induction of a loop, it is being 4997 // inserted into Worklist. If the use will be a scalar use, and the 4998 // pointer is only used by memory accesses, we place the pointer in 4999 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5000 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5001 if (isScalarPtrInduction(MemAccess, Ptr)) { 5002 Worklist.insert(cast<Instruction>(Ptr)); 5003 Instruction *Update = cast<Instruction>( 5004 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5005 Worklist.insert(Update); 5006 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5007 << "\n"); 5008 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5009 << "\n"); 5010 return; 5011 } 5012 // We only care about bitcast and getelementptr instructions contained in 5013 // the loop. 5014 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5015 return; 5016 5017 // If the pointer has already been identified as scalar (e.g., if it was 5018 // also identified as uniform), there's nothing to do. 5019 auto *I = cast<Instruction>(Ptr); 5020 if (Worklist.count(I)) 5021 return; 5022 5023 // If the use of the pointer will be a scalar use, and all users of the 5024 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5025 // place the pointer in PossibleNonScalarPtrs. 5026 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5027 return isa<LoadInst>(U) || isa<StoreInst>(U); 5028 })) 5029 ScalarPtrs.insert(I); 5030 else 5031 PossibleNonScalarPtrs.insert(I); 5032 }; 5033 5034 // We seed the scalars analysis with three classes of instructions: (1) 5035 // instructions marked uniform-after-vectorization and (2) bitcast, 5036 // getelementptr and (pointer) phi instructions used by memory accesses 5037 // requiring a scalar use. 5038 // 5039 // (1) Add to the worklist all instructions that have been identified as 5040 // uniform-after-vectorization. 5041 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5042 5043 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5044 // memory accesses requiring a scalar use. The pointer operands of loads and 5045 // stores will be scalar as long as the memory accesses is not a gather or 5046 // scatter operation. The value operand of a store will remain scalar if the 5047 // store is scalarized. 
5048 for (auto *BB : TheLoop->blocks()) 5049 for (auto &I : *BB) { 5050 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5051 evaluatePtrUse(Load, Load->getPointerOperand()); 5052 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5053 evaluatePtrUse(Store, Store->getPointerOperand()); 5054 evaluatePtrUse(Store, Store->getValueOperand()); 5055 } 5056 } 5057 for (auto *I : ScalarPtrs) 5058 if (!PossibleNonScalarPtrs.count(I)) { 5059 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5060 Worklist.insert(I); 5061 } 5062 5063 // Insert the forced scalars. 5064 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5065 // induction variable when the PHI user is scalarized. 5066 auto ForcedScalar = ForcedScalars.find(VF); 5067 if (ForcedScalar != ForcedScalars.end()) 5068 for (auto *I : ForcedScalar->second) 5069 Worklist.insert(I); 5070 5071 // Expand the worklist by looking through any bitcasts and getelementptr 5072 // instructions we've already identified as scalar. This is similar to the 5073 // expansion step in collectLoopUniforms(); however, here we're only 5074 // expanding to include additional bitcasts and getelementptr instructions. 5075 unsigned Idx = 0; 5076 while (Idx != Worklist.size()) { 5077 Instruction *Dst = Worklist[Idx++]; 5078 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5079 continue; 5080 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5081 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5082 auto *J = cast<Instruction>(U); 5083 return !TheLoop->contains(J) || Worklist.count(J) || 5084 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5085 isScalarUse(J, Src)); 5086 })) { 5087 Worklist.insert(Src); 5088 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5089 } 5090 } 5091 5092 // An induction variable will remain scalar if all users of the induction 5093 // variable and induction variable update remain scalar. 5094 for (auto &Induction : Legal->getInductionVars()) { 5095 auto *Ind = Induction.first; 5096 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5097 5098 // If tail-folding is applied, the primary induction variable will be used 5099 // to feed a vector compare. 5100 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5101 continue; 5102 5103 // Determine if all users of the induction variable are scalar after 5104 // vectorization. 5105 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5106 auto *I = cast<Instruction>(U); 5107 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5108 }); 5109 if (!ScalarInd) 5110 continue; 5111 5112 // Determine if all users of the induction variable update instruction are 5113 // scalar after vectorization. 5114 auto ScalarIndUpdate = 5115 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5116 auto *I = cast<Instruction>(U); 5117 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5118 }); 5119 if (!ScalarIndUpdate) 5120 continue; 5121 5122 // The induction variable and its update instruction will remain scalar. 
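// Illustrative example (not from the source): in
//   for (i = 0; i < n; ++i)
//     a[b[i]] += 1;
// the induction i typically feeds only its own update, the loop test, and
// address computations that are themselves scalar after vectorization, so both
// i and i.next are inserted here; a user demanding a vector of induction values
// would already have been rejected by the all_of checks above.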
5123 Worklist.insert(Ind); 5124 Worklist.insert(IndUpdate); 5125 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5126 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5127 << "\n"); 5128 } 5129 5130 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5131 } 5132 5133 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5134 ElementCount VF) { 5135 if (!blockNeedsPredication(I->getParent())) 5136 return false; 5137 switch(I->getOpcode()) { 5138 default: 5139 break; 5140 case Instruction::Load: 5141 case Instruction::Store: { 5142 if (!Legal->isMaskRequired(I)) 5143 return false; 5144 auto *Ptr = getLoadStorePointerOperand(I); 5145 auto *Ty = getMemInstValueType(I); 5146 // We have already decided how to vectorize this instruction, get that 5147 // result. 5148 if (VF.isVector()) { 5149 InstWidening WideningDecision = getWideningDecision(I, VF); 5150 assert(WideningDecision != CM_Unknown && 5151 "Widening decision should be ready at this moment"); 5152 return WideningDecision == CM_Scalarize; 5153 } 5154 const Align Alignment = getLoadStoreAlignment(I); 5155 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5156 isLegalMaskedGather(Ty, Alignment)) 5157 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5158 isLegalMaskedScatter(Ty, Alignment)); 5159 } 5160 case Instruction::UDiv: 5161 case Instruction::SDiv: 5162 case Instruction::SRem: 5163 case Instruction::URem: 5164 return mayDivideByZero(*I); 5165 } 5166 return false; 5167 } 5168 5169 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5170 Instruction *I, ElementCount VF) { 5171 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5172 assert(getWideningDecision(I, VF) == CM_Unknown && 5173 "Decision should not be set yet."); 5174 auto *Group = getInterleavedAccessGroup(I); 5175 assert(Group && "Must have a group."); 5176 5177 // If the instruction's allocated size doesn't equal it's type size, it 5178 // requires padding and will be scalarized. 5179 auto &DL = I->getModule()->getDataLayout(); 5180 auto *ScalarTy = getMemInstValueType(I); 5181 if (hasIrregularType(ScalarTy, DL, VF)) 5182 return false; 5183 5184 // Check if masking is required. 5185 // A Group may need masking for one of two reasons: it resides in a block that 5186 // needs predication, or it was decided to use masking to deal with gaps. 5187 bool PredicatedAccessRequiresMasking = 5188 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5189 bool AccessWithGapsRequiresMasking = 5190 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5191 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5192 return true; 5193 5194 // If masked interleaving is required, we expect that the user/target had 5195 // enabled it, because otherwise it either wouldn't have been created or 5196 // it should have been invalidated by the CostModel. 5197 assert(useMaskedInterleavedAccesses(TTI) && 5198 "Masked interleave-groups for predicated accesses are not enabled."); 5199 5200 auto *Ty = getMemInstValueType(I); 5201 const Align Alignment = getLoadStoreAlignment(I); 5202 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5203 : TTI.isLegalMaskedStore(Ty, Alignment); 5204 } 5205 5206 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5207 Instruction *I, ElementCount VF) { 5208 // Get and ensure we have a valid memory instruction. 
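// Illustrative examples (not from the source): a load of a[i] inside the loop
// has a consecutive pointer and can be widened into one vector load per unroll
// part, whereas a load of a[3 * i] is not consecutive and is instead handled as
// an interleaved group, a gather, or a scalarized access, so it fails the
// consecutiveness check below.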
5209 LoadInst *LI = dyn_cast<LoadInst>(I); 5210 StoreInst *SI = dyn_cast<StoreInst>(I); 5211 assert((LI || SI) && "Invalid memory instruction"); 5212 5213 auto *Ptr = getLoadStorePointerOperand(I); 5214 5215 // In order to be widened, the pointer should be consecutive, first of all. 5216 if (!Legal->isConsecutivePtr(Ptr)) 5217 return false; 5218 5219 // If the instruction is a store located in a predicated block, it will be 5220 // scalarized. 5221 if (isScalarWithPredication(I)) 5222 return false; 5223 5224 // If the instruction's allocated size doesn't equal it's type size, it 5225 // requires padding and will be scalarized. 5226 auto &DL = I->getModule()->getDataLayout(); 5227 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5228 if (hasIrregularType(ScalarTy, DL, VF)) 5229 return false; 5230 5231 return true; 5232 } 5233 5234 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5235 // We should not collect Uniforms more than once per VF. Right now, 5236 // this function is called from collectUniformsAndScalars(), which 5237 // already does this check. Collecting Uniforms for VF=1 does not make any 5238 // sense. 5239 5240 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5241 "This function should not be visited twice for the same VF"); 5242 5243 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5244 // not analyze again. Uniforms.count(VF) will return 1. 5245 Uniforms[VF].clear(); 5246 5247 // We now know that the loop is vectorizable! 5248 // Collect instructions inside the loop that will remain uniform after 5249 // vectorization. 5250 5251 // Global values, params and instructions outside of current loop are out of 5252 // scope. 5253 auto isOutOfScope = [&](Value *V) -> bool { 5254 Instruction *I = dyn_cast<Instruction>(V); 5255 return (!I || !TheLoop->contains(I)); 5256 }; 5257 5258 SetVector<Instruction *> Worklist; 5259 BasicBlock *Latch = TheLoop->getLoopLatch(); 5260 5261 // Instructions that are scalar with predication must not be considered 5262 // uniform after vectorization, because that would create an erroneous 5263 // replicating region where only a single instance out of VF should be formed. 5264 // TODO: optimize such seldom cases if found important, see PR40816. 5265 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5266 if (isOutOfScope(I)) { 5267 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5268 << *I << "\n"); 5269 return; 5270 } 5271 if (isScalarWithPredication(I, VF)) { 5272 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5273 << *I << "\n"); 5274 return; 5275 } 5276 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5277 Worklist.insert(I); 5278 }; 5279 5280 // Start with the conditional branch. If the branch condition is an 5281 // instruction contained in the loop that is only used by the branch, it is 5282 // uniform. 5283 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5284 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5285 addToWorklistIfAllowed(Cmp); 5286 5287 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5288 InstWidening WideningDecision = getWideningDecision(I, VF); 5289 assert(WideningDecision != CM_Unknown && 5290 "Widening decision should be ready at this moment"); 5291 5292 // A uniform memory op is itself uniform. We exclude uniform stores 5293 // here as they demand the last lane, not the first one. 
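// Illustrative example (not from the source): a load from a loop-invariant
// address, e.g.
//   %v = load i32, i32* @g
// demands only lane 0 and is treated as uniform here, while a store to @g
// inside the loop must execute with the value of the last active lane and is
// therefore deliberately excluded.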
5294 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5295 assert(WideningDecision == CM_Scalarize); 5296 return true; 5297 } 5298 5299 return (WideningDecision == CM_Widen || 5300 WideningDecision == CM_Widen_Reverse || 5301 WideningDecision == CM_Interleave); 5302 }; 5303 5304 5305 // Returns true if Ptr is the pointer operand of a memory access instruction 5306 // I, and I is known to not require scalarization. 5307 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5308 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5309 }; 5310 5311 // Holds a list of values which are known to have at least one uniform use. 5312 // Note that there may be other uses which aren't uniform. A "uniform use" 5313 // here is something which only demands lane 0 of the unrolled iterations; 5314 // it does not imply that all lanes produce the same value (e.g. this is not 5315 // the usual meaning of uniform) 5316 SmallPtrSet<Value *, 8> HasUniformUse; 5317 5318 // Scan the loop for instructions which are either a) known to have only 5319 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5320 for (auto *BB : TheLoop->blocks()) 5321 for (auto &I : *BB) { 5322 // If there's no pointer operand, there's nothing to do. 5323 auto *Ptr = getLoadStorePointerOperand(&I); 5324 if (!Ptr) 5325 continue; 5326 5327 // A uniform memory op is itself uniform. We exclude uniform stores 5328 // here as they demand the last lane, not the first one. 5329 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5330 addToWorklistIfAllowed(&I); 5331 5332 if (isUniformDecision(&I, VF)) { 5333 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5334 HasUniformUse.insert(Ptr); 5335 } 5336 } 5337 5338 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5339 // demanding) users. Since loops are assumed to be in LCSSA form, this 5340 // disallows uses outside the loop as well. 5341 for (auto *V : HasUniformUse) { 5342 if (isOutOfScope(V)) 5343 continue; 5344 auto *I = cast<Instruction>(V); 5345 auto UsersAreMemAccesses = 5346 llvm::all_of(I->users(), [&](User *U) -> bool { 5347 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5348 }); 5349 if (UsersAreMemAccesses) 5350 addToWorklistIfAllowed(I); 5351 } 5352 5353 // Expand Worklist in topological order: whenever a new instruction 5354 // is added , its users should be already inside Worklist. It ensures 5355 // a uniform instruction will only be used by uniform instructions. 5356 unsigned idx = 0; 5357 while (idx != Worklist.size()) { 5358 Instruction *I = Worklist[idx++]; 5359 5360 for (auto OV : I->operand_values()) { 5361 // isOutOfScope operands cannot be uniform instructions. 5362 if (isOutOfScope(OV)) 5363 continue; 5364 // First order recurrence Phi's should typically be considered 5365 // non-uniform. 5366 auto *OP = dyn_cast<PHINode>(OV); 5367 if (OP && Legal->isFirstOrderRecurrence(OP)) 5368 continue; 5369 // If all the users of the operand are uniform, then add the 5370 // operand into the uniform worklist. 5371 auto *OI = cast<Instruction>(OV); 5372 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5373 auto *J = cast<Instruction>(U); 5374 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5375 })) 5376 addToWorklistIfAllowed(OI); 5377 } 5378 } 5379 5380 // For an instruction to be added into Worklist above, all its users inside 5381 // the loop should also be in Worklist. 
However, this condition cannot be 5382 // true for phi nodes that form a cyclic dependence. We must process phi 5383 // nodes separately. An induction variable will remain uniform if all users 5384 // of the induction variable and induction variable update remain uniform. 5385 // The code below handles both pointer and non-pointer induction variables. 5386 for (auto &Induction : Legal->getInductionVars()) { 5387 auto *Ind = Induction.first; 5388 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5389 5390 // Determine if all users of the induction variable are uniform after 5391 // vectorization. 5392 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5393 auto *I = cast<Instruction>(U); 5394 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5395 isVectorizedMemAccessUse(I, Ind); 5396 }); 5397 if (!UniformInd) 5398 continue; 5399 5400 // Determine if all users of the induction variable update instruction are 5401 // uniform after vectorization. 5402 auto UniformIndUpdate = 5403 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5404 auto *I = cast<Instruction>(U); 5405 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5406 isVectorizedMemAccessUse(I, IndUpdate); 5407 }); 5408 if (!UniformIndUpdate) 5409 continue; 5410 5411 // The induction variable and its update instruction will remain uniform. 5412 addToWorklistIfAllowed(Ind); 5413 addToWorklistIfAllowed(IndUpdate); 5414 } 5415 5416 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5417 } 5418 5419 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5420 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5421 5422 if (Legal->getRuntimePointerChecking()->Need) { 5423 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5424 "runtime pointer checks needed. Enable vectorization of this " 5425 "loop with '#pragma clang loop vectorize(enable)' when " 5426 "compiling with -Os/-Oz", 5427 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5428 return true; 5429 } 5430 5431 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5432 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5433 "runtime SCEV checks needed. Enable vectorization of this " 5434 "loop with '#pragma clang loop vectorize(enable)' when " 5435 "compiling with -Os/-Oz", 5436 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5437 return true; 5438 } 5439 5440 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5441 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5442 reportVectorizationFailure("Runtime stride check for small trip count", 5443 "runtime stride == 1 checks needed. Enable vectorization of " 5444 "this loop without such check by compiling with -Os/-Oz", 5445 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5446 return true; 5447 } 5448 5449 return false; 5450 } 5451 5452 Optional<ElementCount> 5453 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5454 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5455 // TODO: It may by useful to do since it's still likely to be dynamically 5456 // uniform if the target can skip. 5457 reportVectorizationFailure( 5458 "Not inserting runtime ptr check for divergent target", 5459 "runtime pointer checks needed. 
Not enabled for divergent target", 5460 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5461 return None; 5462 } 5463 5464 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5465 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5466 if (TC == 1) { 5467 reportVectorizationFailure("Single iteration (non) loop", 5468 "loop trip count is one, irrelevant for vectorization", 5469 "SingleIterationLoop", ORE, TheLoop); 5470 return None; 5471 } 5472 5473 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); 5474 5475 switch (ScalarEpilogueStatus) { 5476 case CM_ScalarEpilogueAllowed: 5477 return MaxVF; 5478 case CM_ScalarEpilogueNotAllowedUsePredicate: 5479 LLVM_FALLTHROUGH; 5480 case CM_ScalarEpilogueNotNeededUsePredicate: 5481 LLVM_DEBUG( 5482 dbgs() << "LV: vector predicate hint/switch found.\n" 5483 << "LV: Not allowing scalar epilogue, creating predicated " 5484 << "vector loop.\n"); 5485 break; 5486 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5487 // fallthrough as a special case of OptForSize 5488 case CM_ScalarEpilogueNotAllowedOptSize: 5489 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5490 LLVM_DEBUG( 5491 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5492 else 5493 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5494 << "count.\n"); 5495 5496 // Bail if runtime checks are required, which are not good when optimising 5497 // for size. 5498 if (runtimeChecksRequired()) 5499 return None; 5500 5501 break; 5502 } 5503 5504 // The only loops we can vectorize without a scalar epilogue, are loops with 5505 // a bottom-test and a single exiting block. We'd have to handle the fact 5506 // that not every instruction executes on the last iteration. This will 5507 // require a lane mask which varies through the vector loop body. (TODO) 5508 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5509 // If there was a tail-folding hint/switch, but we can't fold the tail by 5510 // masking, fallback to a vectorization with a scalar epilogue. 5511 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5512 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5513 "scalar epilogue instead.\n"); 5514 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5515 return MaxVF; 5516 } 5517 return None; 5518 } 5519 5520 // Now try the tail folding 5521 5522 // Invalidate interleave groups that require an epilogue if we can't mask 5523 // the interleave-group. 5524 if (!useMaskedInterleavedAccesses(TTI)) { 5525 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5526 "No decisions should have been taken at this point"); 5527 // Note: There is no need to invalidate any cost modeling decisions here, as 5528 // non where taken so far. 5529 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5530 } 5531 5532 assert(!MaxVF.isScalable() && 5533 "Scalable vectors do not yet support tail folding"); 5534 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && 5535 "MaxVF must be a power of 2"); 5536 unsigned MaxVFtimesIC = 5537 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); 5538 // Avoid tail folding if the trip count is known to be a multiple of any VF we 5539 // chose. 
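// Purely illustrative example: with a known trip count of 64, a MaxVF of 8 and
// a user interleave count of 2, 64 % (8 * 2) == 0, so no tail remains and
// neither tail folding nor a scalar epilogue is needed; with a trip count of
// 70 the remainder is 6 and one of the two is required.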
5540 ScalarEvolution *SE = PSE.getSE(); 5541 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5542 const SCEV *ExitCount = SE->getAddExpr( 5543 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5544 const SCEV *Rem = SE->getURemExpr( 5545 ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5546 if (Rem->isZero()) { 5547 // Accept MaxVF if we do not have a tail. 5548 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5549 return MaxVF; 5550 } 5551 5552 // If we don't know the precise trip count, or if the trip count that we 5553 // found modulo the vectorization factor is not zero, try to fold the tail 5554 // by masking. 5555 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5556 if (Legal->prepareToFoldTailByMasking()) { 5557 FoldTailByMasking = true; 5558 return MaxVF; 5559 } 5560 5561 // If there was a tail-folding hint/switch, but we can't fold the tail by 5562 // masking, fallback to a vectorization with a scalar epilogue. 5563 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5564 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5565 "scalar epilogue instead.\n"); 5566 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5567 return MaxVF; 5568 } 5569 5570 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5571 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5572 return None; 5573 } 5574 5575 if (TC == 0) { 5576 reportVectorizationFailure( 5577 "Unable to calculate the loop count due to complex control flow", 5578 "unable to calculate the loop count due to complex control flow", 5579 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5580 return None; 5581 } 5582 5583 reportVectorizationFailure( 5584 "Cannot optimize for size and vectorize at the same time.", 5585 "cannot optimize for size and vectorize at the same time. " 5586 "Enable vectorization of this loop with '#pragma clang loop " 5587 "vectorize(enable)' when compiling with -Os/-Oz", 5588 "NoTailLoopWithOptForSize", ORE, TheLoop); 5589 return None; 5590 } 5591 5592 ElementCount 5593 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5594 ElementCount UserVF) { 5595 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5596 unsigned SmallestType, WidestType; 5597 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5598 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5599 5600 // Get the maximum safe dependence distance in bits computed by LAA. 5601 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5602 // the memory accesses that is most restrictive (involved in the smallest 5603 // dependence distance). 5604 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5605 5606 if (UserVF.isNonZero()) { 5607 // For now, don't verify legality of scalable vectors. 5608 // This will be addressed properly in https://reviews.llvm.org/D91718. 5609 if (UserVF.isScalable()) 5610 return UserVF; 5611 5612 // If legally unsafe, clamp the user vectorization factor to a safe value. 
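// For instance (made-up numbers): if LAA reports a maximum safe dependence
// width of 256 bits and the widest type in the loop is i32, the clamp below
// yields PowerOf2Floor(256 / 32) = 8, so a user-requested VF of 16 would be
// reduced to 8 while a requested VF of 4 is accepted unchanged.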
5613 unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5614 if (UserVF.getFixedValue() <= MaxSafeVF) 5615 return UserVF; 5616 5617 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5618 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5619 << ".\n"); 5620 ORE->emit([&]() { 5621 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5622 TheLoop->getStartLoc(), 5623 TheLoop->getHeader()) 5624 << "User-specified vectorization factor " 5625 << ore::NV("UserVectorizationFactor", UserVF) 5626 << " is unsafe, clamping to maximum safe vectorization factor " 5627 << ore::NV("VectorizationFactor", MaxSafeVF); 5628 }); 5629 return ElementCount::getFixed(MaxSafeVF); 5630 } 5631 5632 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5633 5634 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5635 // Note that both WidestRegister and WidestType may not be a powers of 2. 5636 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5637 5638 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5639 << " / " << WidestType << " bits.\n"); 5640 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5641 << WidestRegister << " bits.\n"); 5642 5643 assert(MaxVectorSize <= WidestRegister && 5644 "Did not expect to pack so many elements" 5645 " into one vector!"); 5646 if (MaxVectorSize == 0) { 5647 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5648 MaxVectorSize = 1; 5649 return ElementCount::getFixed(MaxVectorSize); 5650 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5651 isPowerOf2_32(ConstTripCount)) { 5652 // We need to clamp the VF to be the ConstTripCount. There is no point in 5653 // choosing a higher viable VF as done in the loop below. 5654 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5655 << ConstTripCount << "\n"); 5656 MaxVectorSize = ConstTripCount; 5657 return ElementCount::getFixed(MaxVectorSize); 5658 } 5659 5660 unsigned MaxVF = MaxVectorSize; 5661 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5662 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5663 // Collect all viable vectorization factors larger than the default MaxVF 5664 // (i.e. MaxVectorSize). 5665 SmallVector<ElementCount, 8> VFs; 5666 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5667 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5668 VFs.push_back(ElementCount::getFixed(VS)); 5669 5670 // For each VF calculate its register usage. 5671 auto RUs = calculateRegisterUsage(VFs); 5672 5673 // Select the largest VF which doesn't require more registers than existing 5674 // ones. 
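// Illustrative example with invented numbers: if the candidate VFs beyond the
// default are {16, 32} and the register-usage estimate says VF=32 needs 40
// registers of a class that only has 32 while VF=16 fits, the loop below
// settles on VF=16.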
5675 for (int i = RUs.size() - 1; i >= 0; --i) { 5676 bool Selected = true; 5677 for (auto& pair : RUs[i].MaxLocalUsers) { 5678 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5679 if (pair.second > TargetNumRegisters) 5680 Selected = false; 5681 } 5682 if (Selected) { 5683 MaxVF = VFs[i].getKnownMinValue(); 5684 break; 5685 } 5686 } 5687 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5688 if (MaxVF < MinVF) { 5689 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5690 << ") with target's minimum: " << MinVF << '\n'); 5691 MaxVF = MinVF; 5692 } 5693 } 5694 } 5695 return ElementCount::getFixed(MaxVF); 5696 } 5697 5698 VectorizationFactor 5699 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5700 // FIXME: This can be fixed for scalable vectors later, because at this stage 5701 // the LoopVectorizer will only consider vectorizing a loop with scalable 5702 // vectors when the loop has a hint to enable vectorization for a given VF. 5703 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5704 5705 float Cost = expectedCost(ElementCount::getFixed(1)).first; 5706 const float ScalarCost = Cost; 5707 unsigned Width = 1; 5708 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5709 5710 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5711 if (ForceVectorization && MaxVF.isVector()) { 5712 // Ignore scalar width, because the user explicitly wants vectorization. 5713 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5714 // evaluation. 5715 Cost = std::numeric_limits<float>::max(); 5716 } 5717 5718 for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { 5719 // Notice that the vector loop needs to be executed less times, so 5720 // we need to divide the cost of the vector loops by the width of 5721 // the vector elements. 5722 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5723 float VectorCost = C.first / (float)i; 5724 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5725 << " costs: " << (int)VectorCost << ".\n"); 5726 if (!C.second && !ForceVectorization) { 5727 LLVM_DEBUG( 5728 dbgs() << "LV: Not considering vector loop of width " << i 5729 << " because it will not generate any vector instructions.\n"); 5730 continue; 5731 } 5732 5733 // If profitable add it to ProfitableVF list. 5734 if (VectorCost < ScalarCost) { 5735 ProfitableVFs.push_back(VectorizationFactor( 5736 {ElementCount::getFixed(i), (unsigned)VectorCost})); 5737 } 5738 5739 if (VectorCost < Cost) { 5740 Cost = VectorCost; 5741 Width = i; 5742 } 5743 } 5744 5745 if (!EnableCondStoresVectorization && NumPredStores) { 5746 reportVectorizationFailure("There are conditional stores.", 5747 "store that is conditionally executed prevents vectorization", 5748 "ConditionalStore", ORE, TheLoop); 5749 Width = 1; 5750 Cost = ScalarCost; 5751 } 5752 5753 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5754 << "LV: Vectorization seems to be not beneficial, " 5755 << "but was forced by a user.\n"); 5756 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5757 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5758 (unsigned)(Width * Cost)}; 5759 return Factor; 5760 } 5761 5762 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5763 const Loop &L, ElementCount VF) const { 5764 // Cross iteration phis such as reductions need special handling and are 5765 // currently unsupported. 
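// E.g. (illustrative), a loop like
//   for (i = 0; i < n; ++i)
//     sum += a[i];
// carries 'sum' across iterations through a header phi, so it is rejected as
// an epilogue-vectorization candidate below.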
5766 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5767 return Legal->isFirstOrderRecurrence(&Phi) || 5768 Legal->isReductionVariable(&Phi); 5769 })) 5770 return false; 5771 5772 // Phis with uses outside of the loop require special handling and are 5773 // currently unsupported. 5774 for (auto &Entry : Legal->getInductionVars()) { 5775 // Look for uses of the value of the induction at the last iteration. 5776 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5777 for (User *U : PostInc->users()) 5778 if (!L.contains(cast<Instruction>(U))) 5779 return false; 5780 // Look for uses of penultimate value of the induction. 5781 for (User *U : Entry.first->users()) 5782 if (!L.contains(cast<Instruction>(U))) 5783 return false; 5784 } 5785 5786 // Induction variables that are widened require special handling that is 5787 // currently not supported. 5788 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5789 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5790 this->isProfitableToScalarize(Entry.first, VF)); 5791 })) 5792 return false; 5793 5794 return true; 5795 } 5796 5797 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5798 const ElementCount VF) const { 5799 // FIXME: We need a much better cost-model to take different parameters such 5800 // as register pressure, code size increase and cost of extra branches into 5801 // account. For now we apply a very crude heuristic and only consider loops 5802 // with vectorization factors larger than a certain value. 5803 // We also consider epilogue vectorization unprofitable for targets that don't 5804 // consider interleaving beneficial (eg. MVE). 5805 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5806 return false; 5807 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5808 return true; 5809 return false; 5810 } 5811 5812 VectorizationFactor 5813 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5814 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5815 VectorizationFactor Result = VectorizationFactor::Disabled(); 5816 if (!EnableEpilogueVectorization) { 5817 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5818 return Result; 5819 } 5820 5821 if (!isScalarEpilogueAllowed()) { 5822 LLVM_DEBUG( 5823 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5824 "allowed.\n";); 5825 return Result; 5826 } 5827 5828 // FIXME: This can be fixed for scalable vectors later, because at this stage 5829 // the LoopVectorizer will only consider vectorizing a loop with scalable 5830 // vectors when the loop has a hint to enable vectorization for a given VF. 5831 if (MainLoopVF.isScalable()) { 5832 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5833 "yet supported.\n"); 5834 return Result; 5835 } 5836 5837 // Not really a cost consideration, but check for unsupported cases here to 5838 // simplify the logic. 
5839 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5840 LLVM_DEBUG( 5841 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5842 "not a supported candidate.\n";); 5843 return Result; 5844 } 5845 5846 if (EpilogueVectorizationForceVF > 1) { 5847 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5848 if (LVP.hasPlanWithVFs( 5849 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5850 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5851 else { 5852 LLVM_DEBUG( 5853 dbgs() 5854 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5855 return Result; 5856 } 5857 } 5858 5859 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5860 TheLoop->getHeader()->getParent()->hasMinSize()) { 5861 LLVM_DEBUG( 5862 dbgs() 5863 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5864 return Result; 5865 } 5866 5867 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 5868 return Result; 5869 5870 for (auto &NextVF : ProfitableVFs) 5871 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 5872 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 5873 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 5874 Result = NextVF; 5875 5876 if (Result != VectorizationFactor::Disabled()) 5877 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5878 << Result.Width.getFixedValue() << "\n";); 5879 return Result; 5880 } 5881 5882 std::pair<unsigned, unsigned> 5883 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5884 unsigned MinWidth = -1U; 5885 unsigned MaxWidth = 8; 5886 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5887 5888 // For each block. 5889 for (BasicBlock *BB : TheLoop->blocks()) { 5890 // For each instruction in the loop. 5891 for (Instruction &I : BB->instructionsWithoutDebug()) { 5892 Type *T = I.getType(); 5893 5894 // Skip ignored values. 5895 if (ValuesToIgnore.count(&I)) 5896 continue; 5897 5898 // Only examine Loads, Stores and PHINodes. 5899 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5900 continue; 5901 5902 // Examine PHI nodes that are reduction variables. Update the type to 5903 // account for the recurrence type. 5904 if (auto *PN = dyn_cast<PHINode>(&I)) { 5905 if (!Legal->isReductionVariable(PN)) 5906 continue; 5907 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5908 T = RdxDesc.getRecurrenceType(); 5909 } 5910 5911 // Examine the stored values. 5912 if (auto *ST = dyn_cast<StoreInst>(&I)) 5913 T = ST->getValueOperand()->getType(); 5914 5915 // Ignore loaded pointer types and stored pointer types that are not 5916 // vectorizable. 5917 // 5918 // FIXME: The check here attempts to predict whether a load or store will 5919 // be vectorized. We only know this for certain after a VF has 5920 // been selected. Here, we assume that if an access can be 5921 // vectorized, it will be. We should also look at extending this 5922 // optimization to non-pointer types. 
5923 // 5924 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5925 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5926 continue; 5927 5928 MinWidth = std::min(MinWidth, 5929 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5930 MaxWidth = std::max(MaxWidth, 5931 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5932 } 5933 } 5934 5935 return {MinWidth, MaxWidth}; 5936 } 5937 5938 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5939 unsigned LoopCost) { 5940 // -- The interleave heuristics -- 5941 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5942 // There are many micro-architectural considerations that we can't predict 5943 // at this level. For example, frontend pressure (on decode or fetch) due to 5944 // code size, or the number and capabilities of the execution ports. 5945 // 5946 // We use the following heuristics to select the interleave count: 5947 // 1. If the code has reductions, then we interleave to break the cross 5948 // iteration dependency. 5949 // 2. If the loop is really small, then we interleave to reduce the loop 5950 // overhead. 5951 // 3. We don't interleave if we think that we will spill registers to memory 5952 // due to the increased register pressure. 5953 5954 if (!isScalarEpilogueAllowed()) 5955 return 1; 5956 5957 // We used the distance for the interleave count. 5958 if (Legal->getMaxSafeDepDistBytes() != -1U) 5959 return 1; 5960 5961 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5962 const bool HasReductions = !Legal->getReductionVars().empty(); 5963 // Do not interleave loops with a relatively small known or estimated trip 5964 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5965 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5966 // because with the above conditions interleaving can expose ILP and break 5967 // cross iteration dependences for reductions. 5968 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5969 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5970 return 1; 5971 5972 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5973 // We divide by these constants so assume that we have at least one 5974 // instruction that uses at least one register. 5975 for (auto& pair : R.MaxLocalUsers) { 5976 pair.second = std::max(pair.second, 1U); 5977 } 5978 5979 // We calculate the interleave count using the following formula. 5980 // Subtract the number of loop invariants from the number of available 5981 // registers. These registers are used by all of the interleaved instances. 5982 // Next, divide the remaining registers by the number of registers that is 5983 // required by the loop, in order to estimate how many parallel instances 5984 // fit without causing spills. All of this is rounded down if necessary to be 5985 // a power of two. We want power of two interleave count to simplify any 5986 // addressing operations or alignment considerations. 5987 // We also want power of two interleave counts to ensure that the induction 5988 // variable of the vector loop wraps to zero, when tail is folded by masking; 5989 // this currently happens when OptForSize, in which case IC is set to 1 above. 
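// Purely illustrative example: with 32 registers in a class, 2 of them tied up
// by loop-invariant values and at most 6 values of that class live at once,
// the formula below gives PowerOf2Floor((32 - 2) / 6) = 4; with the
// induction-variable heuristic enabled it becomes
// PowerOf2Floor((32 - 2 - 1) / (6 - 1)) = 4 as well. The resulting count is
// then clamped against the target's maximum interleave factor and the trip
// count.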
5990 unsigned IC = UINT_MAX; 5991 5992 for (auto& pair : R.MaxLocalUsers) { 5993 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5994 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5995 << " registers of " 5996 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5997 if (VF.isScalar()) { 5998 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5999 TargetNumRegisters = ForceTargetNumScalarRegs; 6000 } else { 6001 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6002 TargetNumRegisters = ForceTargetNumVectorRegs; 6003 } 6004 unsigned MaxLocalUsers = pair.second; 6005 unsigned LoopInvariantRegs = 0; 6006 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6007 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6008 6009 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6010 // Don't count the induction variable as interleaved. 6011 if (EnableIndVarRegisterHeur) { 6012 TmpIC = 6013 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6014 std::max(1U, (MaxLocalUsers - 1))); 6015 } 6016 6017 IC = std::min(IC, TmpIC); 6018 } 6019 6020 // Clamp the interleave ranges to reasonable counts. 6021 unsigned MaxInterleaveCount = 6022 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6023 6024 // Check if the user has overridden the max. 6025 if (VF.isScalar()) { 6026 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6027 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6028 } else { 6029 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6030 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6031 } 6032 6033 // If trip count is known or estimated compile time constant, limit the 6034 // interleave count to be less than the trip count divided by VF, provided it 6035 // is at least 1. 6036 // 6037 // For scalable vectors we can't know if interleaving is beneficial. It may 6038 // not be beneficial for small loops if none of the lanes in the second vector 6039 // iterations is enabled. However, for larger loops, there is likely to be a 6040 // similar benefit as for fixed-width vectors. For now, we choose to leave 6041 // the InterleaveCount as if vscale is '1', although if some information about 6042 // the vector is known (e.g. min vector size), we can make a better decision. 6043 if (BestKnownTC) { 6044 MaxInterleaveCount = 6045 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6046 // Make sure MaxInterleaveCount is greater than 0. 6047 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6048 } 6049 6050 assert(MaxInterleaveCount > 0 && 6051 "Maximum interleave count must be greater than 0"); 6052 6053 // Clamp the calculated IC to be between the 1 and the max interleave count 6054 // that the target and trip count allows. 6055 if (IC > MaxInterleaveCount) 6056 IC = MaxInterleaveCount; 6057 else 6058 // Make sure IC is greater than 0. 6059 IC = std::max(1u, IC); 6060 6061 assert(IC > 0 && "Interleave count must be greater than 0."); 6062 6063 // If we did not calculate the cost for VF (because the user selected the VF) 6064 // then we calculate the cost of VF here. 6065 if (LoopCost == 0) 6066 LoopCost = expectedCost(VF).first; 6067 6068 assert(LoopCost && "Non-zero loop cost expected"); 6069 6070 // Interleave if we vectorized this loop and there is a reduction that could 6071 // benefit from interleaving. 
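// The benefit is a shorter cross-iteration dependence chain. Illustrative
// sketch: interleaving
//   for (i = 0; i < n; ++i) sum += a[i];
// by 2 keeps two partial sums that are only combined after the loop, so the
// serial chain of adds is roughly halved.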
6072 if (VF.isVector() && HasReductions) { 6073 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6074 return IC; 6075 } 6076 6077 // Note that if we've already vectorized the loop we will have done the 6078 // runtime check and so interleaving won't require further checks. 6079 bool InterleavingRequiresRuntimePointerCheck = 6080 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6081 6082 // We want to interleave small loops in order to reduce the loop overhead and 6083 // potentially expose ILP opportunities. 6084 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6085 << "LV: IC is " << IC << '\n' 6086 << "LV: VF is " << VF << '\n'); 6087 const bool AggressivelyInterleaveReductions = 6088 TTI.enableAggressiveInterleaving(HasReductions); 6089 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6090 // We assume that the cost overhead is 1 and we use the cost model 6091 // to estimate the cost of the loop and interleave until the cost of the 6092 // loop overhead is about 5% of the cost of the loop. 6093 unsigned SmallIC = 6094 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6095 6096 // Interleave until store/load ports (estimated by max interleave count) are 6097 // saturated. 6098 unsigned NumStores = Legal->getNumStores(); 6099 unsigned NumLoads = Legal->getNumLoads(); 6100 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6101 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6102 6103 // If we have a scalar reduction (vector reductions are already dealt with 6104 // by this point), we can increase the critical path length if the loop 6105 // we're interleaving is inside another loop. Limit, by default to 2, so the 6106 // critical path only gets increased by one reduction operation. 6107 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6108 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6109 SmallIC = std::min(SmallIC, F); 6110 StoresIC = std::min(StoresIC, F); 6111 LoadsIC = std::min(LoadsIC, F); 6112 } 6113 6114 if (EnableLoadStoreRuntimeInterleave && 6115 std::max(StoresIC, LoadsIC) > SmallIC) { 6116 LLVM_DEBUG( 6117 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6118 return std::max(StoresIC, LoadsIC); 6119 } 6120 6121 // If there are scalar reductions and TTI has enabled aggressive 6122 // interleaving for reductions, we will interleave to expose ILP. 6123 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6124 AggressivelyInterleaveReductions) { 6125 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6126 // Interleave no less than SmallIC but not as aggressive as the normal IC 6127 // to satisfy the rare situation when resources are too limited. 6128 return std::max(IC / 2, SmallIC); 6129 } else { 6130 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6131 return SmallIC; 6132 } 6133 } 6134 6135 // Interleave if this is a large loop (small loops are already dealt with by 6136 // this point) that could benefit from interleaving. 6137 if (AggressivelyInterleaveReductions) { 6138 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6139 return IC; 6140 } 6141 6142 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6143 return 1; 6144 } 6145 6146 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6147 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6148 // This function calculates the register usage by measuring the highest number 6149 // of values that are alive at a single location. 
Obviously, this is a very 6150 // rough estimation. We scan the loop in a topological order in order and 6151 // assign a number to each instruction. We use RPO to ensure that defs are 6152 // met before their users. We assume that each instruction that has in-loop 6153 // users starts an interval. We record every time that an in-loop value is 6154 // used, so we have a list of the first and last occurrences of each 6155 // instruction. Next, we transpose this data structure into a multi map that 6156 // holds the list of intervals that *end* at a specific location. This multi 6157 // map allows us to perform a linear search. We scan the instructions linearly 6158 // and record each time that a new interval starts, by placing it in a set. 6159 // If we find this value in the multi-map then we remove it from the set. 6160 // The max register usage is the maximum size of the set. 6161 // We also search for instructions that are defined outside the loop, but are 6162 // used inside the loop. We need this number separately from the max-interval 6163 // usage number because when we unroll, loop-invariant values do not take 6164 // more register. 6165 LoopBlocksDFS DFS(TheLoop); 6166 DFS.perform(LI); 6167 6168 RegisterUsage RU; 6169 6170 // Each 'key' in the map opens a new interval. The values 6171 // of the map are the index of the 'last seen' usage of the 6172 // instruction that is the key. 6173 using IntervalMap = DenseMap<Instruction *, unsigned>; 6174 6175 // Maps instruction to its index. 6176 SmallVector<Instruction *, 64> IdxToInstr; 6177 // Marks the end of each interval. 6178 IntervalMap EndPoint; 6179 // Saves the list of instruction indices that are used in the loop. 6180 SmallPtrSet<Instruction *, 8> Ends; 6181 // Saves the list of values that are used in the loop but are 6182 // defined outside the loop, such as arguments and constants. 6183 SmallPtrSet<Value *, 8> LoopInvariants; 6184 6185 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6186 for (Instruction &I : BB->instructionsWithoutDebug()) { 6187 IdxToInstr.push_back(&I); 6188 6189 // Save the end location of each USE. 6190 for (Value *U : I.operands()) { 6191 auto *Instr = dyn_cast<Instruction>(U); 6192 6193 // Ignore non-instruction values such as arguments, constants, etc. 6194 if (!Instr) 6195 continue; 6196 6197 // If this instruction is outside the loop then record it and continue. 6198 if (!TheLoop->contains(Instr)) { 6199 LoopInvariants.insert(Instr); 6200 continue; 6201 } 6202 6203 // Overwrite previous end points. 6204 EndPoint[Instr] = IdxToInstr.size(); 6205 Ends.insert(Instr); 6206 } 6207 } 6208 } 6209 6210 // Saves the list of intervals that end with the index in 'key'. 6211 using InstrList = SmallVector<Instruction *, 2>; 6212 DenseMap<unsigned, InstrList> TransposeEnds; 6213 6214 // Transpose the EndPoints to a list of values that end at each index. 6215 for (auto &Interval : EndPoint) 6216 TransposeEnds[Interval.second].push_back(Interval.first); 6217 6218 SmallPtrSet<Instruction *, 8> OpenIntervals; 6219 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6220 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6221 6222 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6223 6224 // A lambda that gets the register usage for the given type and VF. 
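// For example (rough and target-dependent): for an i32 value at VF = 8 this
// asks TTI how many registers an <8 x i32> occupies, typically 1 on a 256-bit
// register file and 2 on a 128-bit one. Token types and other invalid vector
// element types are counted as 0.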
6225 const auto &TTICapture = TTI; 6226 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6227 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6228 return 0U; 6229 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6230 }; 6231 6232 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6233 Instruction *I = IdxToInstr[i]; 6234 6235 // Remove all of the instructions that end at this location. 6236 InstrList &List = TransposeEnds[i]; 6237 for (Instruction *ToRemove : List) 6238 OpenIntervals.erase(ToRemove); 6239 6240 // Ignore instructions that are never used within the loop. 6241 if (!Ends.count(I)) 6242 continue; 6243 6244 // Skip ignored values. 6245 if (ValuesToIgnore.count(I)) 6246 continue; 6247 6248 // For each VF find the maximum usage of registers. 6249 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6250 // Count the number of live intervals. 6251 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6252 6253 if (VFs[j].isScalar()) { 6254 for (auto Inst : OpenIntervals) { 6255 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6256 if (RegUsage.find(ClassID) == RegUsage.end()) 6257 RegUsage[ClassID] = 1; 6258 else 6259 RegUsage[ClassID] += 1; 6260 } 6261 } else { 6262 collectUniformsAndScalars(VFs[j]); 6263 for (auto Inst : OpenIntervals) { 6264 // Skip ignored values for VF > 1. 6265 if (VecValuesToIgnore.count(Inst)) 6266 continue; 6267 if (isScalarAfterVectorization(Inst, VFs[j])) { 6268 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6269 if (RegUsage.find(ClassID) == RegUsage.end()) 6270 RegUsage[ClassID] = 1; 6271 else 6272 RegUsage[ClassID] += 1; 6273 } else { 6274 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6275 if (RegUsage.find(ClassID) == RegUsage.end()) 6276 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6277 else 6278 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6279 } 6280 } 6281 } 6282 6283 for (auto& pair : RegUsage) { 6284 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6285 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6286 else 6287 MaxUsages[j][pair.first] = pair.second; 6288 } 6289 } 6290 6291 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6292 << OpenIntervals.size() << '\n'); 6293 6294 // Add the current instruction to the list of open intervals. 6295 OpenIntervals.insert(I); 6296 } 6297 6298 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6299 SmallMapVector<unsigned, unsigned, 4> Invariant; 6300 6301 for (auto Inst : LoopInvariants) { 6302 unsigned Usage = 6303 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6304 unsigned ClassID = 6305 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6306 if (Invariant.find(ClassID) == Invariant.end()) 6307 Invariant[ClassID] = Usage; 6308 else 6309 Invariant[ClassID] += Usage; 6310 } 6311 6312 LLVM_DEBUG({ 6313 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6314 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6315 << " item\n"; 6316 for (const auto &pair : MaxUsages[i]) { 6317 dbgs() << "LV(REG): RegisterClass: " 6318 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6319 << " registers\n"; 6320 } 6321 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6322 << " item\n"; 6323 for (const auto &pair : Invariant) { 6324 dbgs() << "LV(REG): RegisterClass: " 6325 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6326 << " registers\n"; 6327 } 6328 }); 6329 6330 RU.LoopInvariantRegs = Invariant; 6331 RU.MaxLocalUsers = MaxUsages[i]; 6332 RUs[i] = RU; 6333 } 6334 6335 return RUs; 6336 } 6337 6338 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6339 // TODO: Cost model for emulated masked load/store is completely 6340 // broken. This hack guides the cost model to use an artificially 6341 // high enough value to practically disable vectorization with such 6342 // operations, except where previously deployed legality hack allowed 6343 // using very low cost values. This is to avoid regressions coming simply 6344 // from moving "masked load/store" check from legality to cost model. 6345 // Masked Load/Gather emulation was previously never allowed. 6346 // Limited number of Masked Store/Scatter emulation was allowed. 6347 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6348 return isa<LoadInst>(I) || 6349 (isa<StoreInst>(I) && 6350 NumPredStores > NumberOfStoresToPredicate); 6351 } 6352 6353 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6354 // If we aren't vectorizing the loop, or if we've already collected the 6355 // instructions to scalarize, there's nothing to do. Collection may already 6356 // have occurred if we have a user-selected VF and are now computing the 6357 // expected cost for interleaving. 6358 if (VF.isScalar() || VF.isZero() || 6359 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6360 return; 6361 6362 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6363 // not profitable to scalarize any instructions, the presence of VF in the 6364 // map will indicate that we've analyzed it already. 6365 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6366 6367 // Find all the instructions that are scalar with predication in the loop and 6368 // determine if it would be better to not if-convert the blocks they are in. 6369 // If so, we also record the instructions to scalarize. 6370 for (BasicBlock *BB : TheLoop->blocks()) { 6371 if (!blockNeedsPredication(BB)) 6372 continue; 6373 for (Instruction &I : *BB) 6374 if (isScalarWithPredication(&I)) { 6375 ScalarCostsTy ScalarCosts; 6376 // Do not apply discount logic if hacked cost is needed 6377 // for emulated masked memrefs. 6378 if (!useEmulatedMaskMemRefHack(&I) && 6379 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6380 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6381 // Remember that BB will remain after vectorization. 
6382 PredicatedBBsAfterVectorization.insert(BB); 6383 } 6384 } 6385 } 6386 6387 int LoopVectorizationCostModel::computePredInstDiscount( 6388 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 6389 ElementCount VF) { 6390 assert(!isUniformAfterVectorization(PredInst, VF) && 6391 "Instruction marked uniform-after-vectorization will be predicated"); 6392 6393 // Initialize the discount to zero, meaning that the scalar version and the 6394 // vector version cost the same. 6395 int Discount = 0; 6396 6397 // Holds instructions to analyze. The instructions we visit are mapped in 6398 // ScalarCosts. Those instructions are the ones that would be scalarized if 6399 // we find that the scalar version costs less. 6400 SmallVector<Instruction *, 8> Worklist; 6401 6402 // Returns true if the given instruction can be scalarized. 6403 auto canBeScalarized = [&](Instruction *I) -> bool { 6404 // We only attempt to scalarize instructions forming a single-use chain 6405 // from the original predicated block that would otherwise be vectorized. 6406 // Although not strictly necessary, we give up on instructions we know will 6407 // already be scalar to avoid traversing chains that are unlikely to be 6408 // beneficial. 6409 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6410 isScalarAfterVectorization(I, VF)) 6411 return false; 6412 6413 // If the instruction is scalar with predication, it will be analyzed 6414 // separately. We ignore it within the context of PredInst. 6415 if (isScalarWithPredication(I)) 6416 return false; 6417 6418 // If any of the instruction's operands are uniform after vectorization, 6419 // the instruction cannot be scalarized. This prevents, for example, a 6420 // masked load from being scalarized. 6421 // 6422 // We assume we will only emit a value for lane zero of an instruction 6423 // marked uniform after vectorization, rather than VF identical values. 6424 // Thus, if we scalarize an instruction that uses a uniform, we would 6425 // create uses of values corresponding to the lanes we aren't emitting code 6426 // for. This behavior can be changed by allowing getScalarValue to clone 6427 // the lane zero values for uniforms rather than asserting. 6428 for (Use &U : I->operands()) 6429 if (auto *J = dyn_cast<Instruction>(U.get())) 6430 if (isUniformAfterVectorization(J, VF)) 6431 return false; 6432 6433 // Otherwise, we can scalarize the instruction. 6434 return true; 6435 }; 6436 6437 // Compute the expected cost discount from scalarizing the entire expression 6438 // feeding the predicated instruction. We currently only consider expressions 6439 // that are single-use instruction chains. 6440 Worklist.push_back(PredInst); 6441 while (!Worklist.empty()) { 6442 Instruction *I = Worklist.pop_back_val(); 6443 6444 // If we've already analyzed the instruction, there's nothing to do. 6445 if (ScalarCosts.find(I) != ScalarCosts.end()) 6446 continue; 6447 6448 // Compute the cost of the vector instruction. Note that this cost already 6449 // includes the scalarization overhead of the predicated instruction. 6450 unsigned VectorCost = getInstructionCost(I, VF).first; 6451 6452 // Compute the cost of the scalarized instruction. This cost is the cost of 6453 // the instruction as if it wasn't if-converted and instead remained in the 6454 // predicated block. We will scale this cost by block probability after 6455 // computing the scalarization overhead. 
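// Rough illustration with made-up costs: at VF = 4 an instruction with a
// scalar cost of 1 starts out at 4; after the division by the reciprocal block
// probability below (2, i.e. the block is assumed to run every other
// iteration) the scalarized estimate is 2, and what it contributes to the
// discount is the vector cost minus this scaled scalar cost.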
6456 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6457 unsigned ScalarCost = 6458 VF.getKnownMinValue() * 6459 getInstructionCost(I, ElementCount::getFixed(1)).first; 6460 6461 // Compute the scalarization overhead of needed insertelement instructions 6462 // and phi nodes. 6463 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6464 ScalarCost += TTI.getScalarizationOverhead( 6465 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6466 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6467 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6468 ScalarCost += 6469 VF.getKnownMinValue() * 6470 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6471 } 6472 6473 // Compute the scalarization overhead of needed extractelement 6474 // instructions. For each of the instruction's operands, if the operand can 6475 // be scalarized, add it to the worklist; otherwise, account for the 6476 // overhead. 6477 for (Use &U : I->operands()) 6478 if (auto *J = dyn_cast<Instruction>(U.get())) { 6479 assert(VectorType::isValidElementType(J->getType()) && 6480 "Instruction has non-scalar type"); 6481 if (canBeScalarized(J)) 6482 Worklist.push_back(J); 6483 else if (needsExtract(J, VF)) { 6484 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6485 ScalarCost += TTI.getScalarizationOverhead( 6486 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6487 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6488 } 6489 } 6490 6491 // Scale the total scalar cost by block probability. 6492 ScalarCost /= getReciprocalPredBlockProb(); 6493 6494 // Compute the discount. A non-negative discount means the vector version 6495 // of the instruction costs more, and scalarizing would be beneficial. 6496 Discount += VectorCost - ScalarCost; 6497 ScalarCosts[I] = ScalarCost; 6498 } 6499 6500 return Discount; 6501 } 6502 6503 LoopVectorizationCostModel::VectorizationCostTy 6504 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6505 VectorizationCostTy Cost; 6506 6507 // For each block. 6508 for (BasicBlock *BB : TheLoop->blocks()) { 6509 VectorizationCostTy BlockCost; 6510 6511 // For each instruction in the old loop. 6512 for (Instruction &I : BB->instructionsWithoutDebug()) { 6513 // Skip ignored values. 6514 if (ValuesToIgnore.count(&I) || 6515 (VF.isVector() && VecValuesToIgnore.count(&I))) 6516 continue; 6517 6518 VectorizationCostTy C = getInstructionCost(&I, VF); 6519 6520 // Check if we should override the cost. 6521 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6522 C.first = ForceTargetInstructionCost; 6523 6524 BlockCost.first += C.first; 6525 BlockCost.second |= C.second; 6526 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6527 << " for VF " << VF << " For instruction: " << I 6528 << '\n'); 6529 } 6530 6531 // If we are vectorizing a predicated block, it will have been 6532 // if-converted. This means that the block's instructions (aside from 6533 // stores and instructions that may divide by zero) will now be 6534 // unconditionally executed. For the scalar case, we may not always execute 6535 // the predicated block, if it is an if-else block. Thus, scale the block's 6536 // cost by the probability of executing it. blockNeedsPredication from 6537 // Legal is used so as to not include all blocks in tail folded loops. 
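// E.g. (illustrative): for VF = 1, a predicated block whose instructions add
// up to a cost of 8 is accounted as 8 / 2 = 4, reflecting the assumption that
// the block executes on roughly half of the iterations.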
6538 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6539 BlockCost.first /= getReciprocalPredBlockProb(); 6540 6541 Cost.first += BlockCost.first; 6542 Cost.second |= BlockCost.second; 6543 } 6544 6545 return Cost; 6546 } 6547 6548 /// Gets Address Access SCEV after verifying that the access pattern 6549 /// is loop invariant except the induction variable dependence. 6550 /// 6551 /// This SCEV can be sent to the Target in order to estimate the address 6552 /// calculation cost. 6553 static const SCEV *getAddressAccessSCEV( 6554 Value *Ptr, 6555 LoopVectorizationLegality *Legal, 6556 PredicatedScalarEvolution &PSE, 6557 const Loop *TheLoop) { 6558 6559 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6560 if (!Gep) 6561 return nullptr; 6562 6563 // We are looking for a gep with all loop invariant indices except for one 6564 // which should be an induction variable. 6565 auto SE = PSE.getSE(); 6566 unsigned NumOperands = Gep->getNumOperands(); 6567 for (unsigned i = 1; i < NumOperands; ++i) { 6568 Value *Opd = Gep->getOperand(i); 6569 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6570 !Legal->isInductionVariable(Opd)) 6571 return nullptr; 6572 } 6573 6574 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6575 return PSE.getSCEV(Ptr); 6576 } 6577 6578 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6579 return Legal->hasStride(I->getOperand(0)) || 6580 Legal->hasStride(I->getOperand(1)); 6581 } 6582 6583 unsigned 6584 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6585 ElementCount VF) { 6586 assert(VF.isVector() && 6587 "Scalarization cost of instruction implies vectorization."); 6588 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6589 Type *ValTy = getMemInstValueType(I); 6590 auto SE = PSE.getSE(); 6591 6592 unsigned AS = getLoadStoreAddressSpace(I); 6593 Value *Ptr = getLoadStorePointerOperand(I); 6594 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6595 6596 // Figure out whether the access is strided and get the stride value 6597 // if it's known in compile time 6598 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6599 6600 // Get the cost of the scalar memory instruction and address computation. 6601 unsigned Cost = 6602 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6603 6604 // Don't pass *I here, since it is scalar but will actually be part of a 6605 // vectorized loop where the user of it is a vectorized instruction. 6606 const Align Alignment = getLoadStoreAlignment(I); 6607 Cost += VF.getKnownMinValue() * 6608 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6609 AS, TTI::TCK_RecipThroughput); 6610 6611 // Get the overhead of the extractelement and insertelement instructions 6612 // we might create due to scalarization. 6613 Cost += getScalarizationOverhead(I, VF); 6614 6615 // If we have a predicated store, it may not be executed for each vector 6616 // lane. Scale the cost by the probability of executing the predicated 6617 // block. 6618 if (isPredicatedInst(I)) { 6619 Cost /= getReciprocalPredBlockProb(); 6620 6621 if (useEmulatedMaskMemRefHack(I)) 6622 // Artificially setting to a high enough value to practically disable 6623 // vectorization with such operations. 
6624 Cost = 3000000; 6625 } 6626 6627 return Cost; 6628 } 6629 6630 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6631 ElementCount VF) { 6632 Type *ValTy = getMemInstValueType(I); 6633 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6634 Value *Ptr = getLoadStorePointerOperand(I); 6635 unsigned AS = getLoadStoreAddressSpace(I); 6636 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6637 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6638 6639 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6640 "Stride should be 1 or -1 for consecutive memory access"); 6641 const Align Alignment = getLoadStoreAlignment(I); 6642 unsigned Cost = 0; 6643 if (Legal->isMaskRequired(I)) 6644 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6645 CostKind); 6646 else 6647 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6648 CostKind, I); 6649 6650 bool Reverse = ConsecutiveStride < 0; 6651 if (Reverse) 6652 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6653 return Cost; 6654 } 6655 6656 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6657 ElementCount VF) { 6658 assert(Legal->isUniformMemOp(*I)); 6659 6660 Type *ValTy = getMemInstValueType(I); 6661 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6662 const Align Alignment = getLoadStoreAlignment(I); 6663 unsigned AS = getLoadStoreAddressSpace(I); 6664 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6665 if (isa<LoadInst>(I)) { 6666 return TTI.getAddressComputationCost(ValTy) + 6667 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6668 CostKind) + 6669 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6670 } 6671 StoreInst *SI = cast<StoreInst>(I); 6672 6673 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6674 return TTI.getAddressComputationCost(ValTy) + 6675 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6676 CostKind) + 6677 (isLoopInvariantStoreValue 6678 ? 0 6679 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6680 VF.getKnownMinValue() - 1)); 6681 } 6682 6683 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6684 ElementCount VF) { 6685 Type *ValTy = getMemInstValueType(I); 6686 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6687 const Align Alignment = getLoadStoreAlignment(I); 6688 const Value *Ptr = getLoadStorePointerOperand(I); 6689 6690 return TTI.getAddressComputationCost(VectorTy) + 6691 TTI.getGatherScatterOpCost( 6692 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6693 TargetTransformInfo::TCK_RecipThroughput, I); 6694 } 6695 6696 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6697 ElementCount VF) { 6698 Type *ValTy = getMemInstValueType(I); 6699 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6700 unsigned AS = getLoadStoreAddressSpace(I); 6701 6702 auto Group = getInterleavedAccessGroup(I); 6703 assert(Group && "Fail to get an interleaved access group."); 6704 6705 unsigned InterleaveFactor = Group->getFactor(); 6706 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6707 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6708 6709 // Holds the indices of existing members in an interleaved load group. 6710 // An interleaved store group doesn't need this as it doesn't allow gaps. 
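// For instance (illustrative), in a factor-4 load group where member 2 is
// absent (a gap), the loop below collects Indices = {0, 1, 3}, and the
// interleaved-memory cost query is made for just those members.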
6711 SmallVector<unsigned, 4> Indices; 6712 if (isa<LoadInst>(I)) { 6713 for (unsigned i = 0; i < InterleaveFactor; i++) 6714 if (Group->getMember(i)) 6715 Indices.push_back(i); 6716 } 6717 6718 // Calculate the cost of the whole interleaved group. 6719 bool UseMaskForGaps = 6720 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6721 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6722 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6723 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6724 6725 if (Group->isReverse()) { 6726 // TODO: Add support for reversed masked interleaved access. 6727 assert(!Legal->isMaskRequired(I) && 6728 "Reverse masked interleaved access not supported."); 6729 Cost += Group->getNumMembers() * 6730 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6731 } 6732 return Cost; 6733 } 6734 6735 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6736 ElementCount VF) { 6737 // Calculate scalar cost only. Vectorization cost should be ready at this 6738 // moment. 6739 if (VF.isScalar()) { 6740 Type *ValTy = getMemInstValueType(I); 6741 const Align Alignment = getLoadStoreAlignment(I); 6742 unsigned AS = getLoadStoreAddressSpace(I); 6743 6744 return TTI.getAddressComputationCost(ValTy) + 6745 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6746 TTI::TCK_RecipThroughput, I); 6747 } 6748 return getWideningCost(I, VF); 6749 } 6750 6751 LoopVectorizationCostModel::VectorizationCostTy 6752 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6753 ElementCount VF) { 6754 // If we know that this instruction will remain uniform, check the cost of 6755 // the scalar version. 6756 if (isUniformAfterVectorization(I, VF)) 6757 VF = ElementCount::getFixed(1); 6758 6759 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6760 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6761 6762 // Forced scalars do not have any scalarization overhead. 6763 auto ForcedScalar = ForcedScalars.find(VF); 6764 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6765 auto InstSet = ForcedScalar->second; 6766 if (InstSet.count(I)) 6767 return VectorizationCostTy( 6768 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6769 VF.getKnownMinValue()), 6770 false); 6771 } 6772 6773 Type *VectorTy; 6774 unsigned C = getInstructionCost(I, VF, VectorTy); 6775 6776 bool TypeNotScalarized = 6777 VF.isVector() && VectorTy->isVectorTy() && 6778 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6779 return VectorizationCostTy(C, TypeNotScalarized); 6780 } 6781 6782 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6783 ElementCount VF) { 6784 6785 assert(!VF.isScalable() && 6786 "cannot compute scalarization overhead for scalable vectorization"); 6787 if (VF.isScalar()) 6788 return 0; 6789 6790 unsigned Cost = 0; 6791 Type *RetTy = ToVectorTy(I->getType(), VF); 6792 if (!RetTy->isVoidTy() && 6793 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6794 Cost += TTI.getScalarizationOverhead( 6795 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 6796 true, false); 6797 6798 // Some targets keep addresses scalar. 6799 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6800 return Cost; 6801 6802 // Some targets support efficient element stores. 6803 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6804 return Cost; 6805 6806 // Collect operands to consider. 
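  // For calls, only the argument operands are scanned below (the callee
  // operand is never extracted); for all other instructions every operand is
  // a candidate.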
6807 CallInst *CI = dyn_cast<CallInst>(I); 6808 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6809 6810 // Skip operands that do not require extraction/scalarization and do not incur 6811 // any overhead. 6812 return Cost + TTI.getOperandsScalarizationOverhead( 6813 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 6814 } 6815 6816 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6817 if (VF.isScalar()) 6818 return; 6819 NumPredStores = 0; 6820 for (BasicBlock *BB : TheLoop->blocks()) { 6821 // For each instruction in the old loop. 6822 for (Instruction &I : *BB) { 6823 Value *Ptr = getLoadStorePointerOperand(&I); 6824 if (!Ptr) 6825 continue; 6826 6827 // TODO: We should generate better code and update the cost model for 6828 // predicated uniform stores. Today they are treated as any other 6829 // predicated store (see added test cases in 6830 // invariant-store-vectorization.ll). 6831 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6832 NumPredStores++; 6833 6834 if (Legal->isUniformMemOp(I)) { 6835 // TODO: Avoid replicating loads and stores instead of 6836 // relying on instcombine to remove them. 6837 // Load: Scalar load + broadcast 6838 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6839 unsigned Cost = getUniformMemOpCost(&I, VF); 6840 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6841 continue; 6842 } 6843 6844 // We assume that widening is the best solution when possible. 6845 if (memoryInstructionCanBeWidened(&I, VF)) { 6846 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6847 int ConsecutiveStride = 6848 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6849 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6850 "Expected consecutive stride."); 6851 InstWidening Decision = 6852 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6853 setWideningDecision(&I, VF, Decision, Cost); 6854 continue; 6855 } 6856 6857 // Choose between Interleaving, Gather/Scatter or Scalarization. 6858 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6859 unsigned NumAccesses = 1; 6860 if (isAccessInterleaved(&I)) { 6861 auto Group = getInterleavedAccessGroup(&I); 6862 assert(Group && "Fail to get an interleaved access group."); 6863 6864 // Make one decision for the whole group. 6865 if (getWideningDecision(&I, VF) != CM_Unknown) 6866 continue; 6867 6868 NumAccesses = Group->getNumMembers(); 6869 if (interleavedAccessCanBeWidened(&I, VF)) 6870 InterleaveCost = getInterleaveGroupCost(&I, VF); 6871 } 6872 6873 unsigned GatherScatterCost = 6874 isLegalGatherOrScatter(&I) 6875 ? getGatherScatterCost(&I, VF) * NumAccesses 6876 : std::numeric_limits<unsigned>::max(); 6877 6878 unsigned ScalarizationCost = 6879 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6880 6881 // Choose better solution for the current VF, 6882 // write down this decision and use it during vectorization. 6883 unsigned Cost; 6884 InstWidening Decision; 6885 if (InterleaveCost <= GatherScatterCost && 6886 InterleaveCost < ScalarizationCost) { 6887 Decision = CM_Interleave; 6888 Cost = InterleaveCost; 6889 } else if (GatherScatterCost < ScalarizationCost) { 6890 Decision = CM_GatherScatter; 6891 Cost = GatherScatterCost; 6892 } else { 6893 Decision = CM_Scalarize; 6894 Cost = ScalarizationCost; 6895 } 6896 // If the instructions belongs to an interleave group, the whole group 6897 // receives the same decision. 
The whole group receives the cost, but 6898 // the cost will actually be assigned to one instruction. 6899 if (auto Group = getInterleavedAccessGroup(&I)) 6900 setWideningDecision(Group, VF, Decision, Cost); 6901 else 6902 setWideningDecision(&I, VF, Decision, Cost); 6903 } 6904 } 6905 6906 // Make sure that any load of address and any other address computation 6907 // remains scalar unless there is gather/scatter support. This avoids 6908 // inevitable extracts into address registers, and also has the benefit of 6909 // activating LSR more, since that pass can't optimize vectorized 6910 // addresses. 6911 if (TTI.prefersVectorizedAddressing()) 6912 return; 6913 6914 // Start with all scalar pointer uses. 6915 SmallPtrSet<Instruction *, 8> AddrDefs; 6916 for (BasicBlock *BB : TheLoop->blocks()) 6917 for (Instruction &I : *BB) { 6918 Instruction *PtrDef = 6919 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6920 if (PtrDef && TheLoop->contains(PtrDef) && 6921 getWideningDecision(&I, VF) != CM_GatherScatter) 6922 AddrDefs.insert(PtrDef); 6923 } 6924 6925 // Add all instructions used to generate the addresses. 6926 SmallVector<Instruction *, 4> Worklist; 6927 for (auto *I : AddrDefs) 6928 Worklist.push_back(I); 6929 while (!Worklist.empty()) { 6930 Instruction *I = Worklist.pop_back_val(); 6931 for (auto &Op : I->operands()) 6932 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6933 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6934 AddrDefs.insert(InstOp).second) 6935 Worklist.push_back(InstOp); 6936 } 6937 6938 for (auto *I : AddrDefs) { 6939 if (isa<LoadInst>(I)) { 6940 // Setting the desired widening decision should ideally be handled in 6941 // by cost functions, but since this involves the task of finding out 6942 // if the loaded register is involved in an address computation, it is 6943 // instead changed here when we know this is the case. 6944 InstWidening Decision = getWideningDecision(I, VF); 6945 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6946 // Scalarize a widened load of address. 6947 setWideningDecision( 6948 I, VF, CM_Scalarize, 6949 (VF.getKnownMinValue() * 6950 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6951 else if (auto Group = getInterleavedAccessGroup(I)) { 6952 // Scalarize an interleave group of address loads. 6953 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6954 if (Instruction *Member = Group->getMember(I)) 6955 setWideningDecision( 6956 Member, VF, CM_Scalarize, 6957 (VF.getKnownMinValue() * 6958 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6959 } 6960 } 6961 } else 6962 // Make sure I gets scalarized and a cost estimate without 6963 // scalarization overhead. 6964 ForcedScalars[VF].insert(I); 6965 } 6966 } 6967 6968 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6969 ElementCount VF, 6970 Type *&VectorTy) { 6971 Type *RetTy = I->getType(); 6972 if (canTruncateToMinimalBitwidth(I, VF)) 6973 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6974 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6975 auto SE = PSE.getSE(); 6976 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6977 6978 // TODO: We need to estimate the cost of intrinsic calls. 6979 switch (I->getOpcode()) { 6980 case Instruction::GetElementPtr: 6981 // We mark this instruction as zero-cost because the cost of GEPs in 6982 // vectorized code depends on whether the corresponding memory instruction 6983 // is scalarized or not. 
Therefore, we handle GEPs with the memory 6984 // instruction cost. 6985 return 0; 6986 case Instruction::Br: { 6987 // In cases of scalarized and predicated instructions, there will be VF 6988 // predicated blocks in the vectorized loop. Each branch around these 6989 // blocks requires also an extract of its vector compare i1 element. 6990 bool ScalarPredicatedBB = false; 6991 BranchInst *BI = cast<BranchInst>(I); 6992 if (VF.isVector() && BI->isConditional() && 6993 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6994 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6995 ScalarPredicatedBB = true; 6996 6997 if (ScalarPredicatedBB) { 6998 // Return cost for branches around scalarized and predicated blocks. 6999 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7000 auto *Vec_i1Ty = 7001 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7002 return (TTI.getScalarizationOverhead( 7003 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7004 false, true) + 7005 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7006 VF.getKnownMinValue())); 7007 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7008 // The back-edge branch will remain, as will all scalar branches. 7009 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7010 else 7011 // This branch will be eliminated by if-conversion. 7012 return 0; 7013 // Note: We currently assume zero cost for an unconditional branch inside 7014 // a predicated block since it will become a fall-through, although we 7015 // may decide in the future to call TTI for all branches. 7016 } 7017 case Instruction::PHI: { 7018 auto *Phi = cast<PHINode>(I); 7019 7020 // First-order recurrences are replaced by vector shuffles inside the loop. 7021 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7022 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7023 return TTI.getShuffleCost( 7024 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7025 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7026 7027 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7028 // converted into select instructions. We require N - 1 selects per phi 7029 // node, where N is the number of incoming values. 7030 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7031 return (Phi->getNumIncomingValues() - 1) * 7032 TTI.getCmpSelInstrCost( 7033 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7034 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7035 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7036 7037 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7038 } 7039 case Instruction::UDiv: 7040 case Instruction::SDiv: 7041 case Instruction::URem: 7042 case Instruction::SRem: 7043 // If we have a predicated instruction, it may not be executed for each 7044 // vector lane. Get the scalarization cost and scale this amount by the 7045 // probability of executing the predicated block. If the instruction is not 7046 // predicated, we fall through to the next case. 7047 if (VF.isVector() && isScalarWithPredication(I)) { 7048 unsigned Cost = 0; 7049 7050 // These instructions have a non-void type, so account for the phi nodes 7051 // that we will create. This cost is likely to be zero. The phi node 7052 // cost, if any, should be scaled by the block probability because it 7053 // models a copy at the end of each predicated block. 
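      // Rough illustration (actual numbers are target-dependent, and this
      // assumes getReciprocalPredBlockProb() models a 50% block probability,
      // i.e. returns 2): with VF == 4, a predicated udiv is charged about
      //   (4 * phi-cost + 4 * udiv-cost + insert/extract overhead) / 2.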
7054 Cost += VF.getKnownMinValue() * 7055 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7056 7057 // The cost of the non-predicated instruction. 7058 Cost += VF.getKnownMinValue() * 7059 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7060 7061 // The cost of insertelement and extractelement instructions needed for 7062 // scalarization. 7063 Cost += getScalarizationOverhead(I, VF); 7064 7065 // Scale the cost by the probability of executing the predicated blocks. 7066 // This assumes the predicated block for each vector lane is equally 7067 // likely. 7068 return Cost / getReciprocalPredBlockProb(); 7069 } 7070 LLVM_FALLTHROUGH; 7071 case Instruction::Add: 7072 case Instruction::FAdd: 7073 case Instruction::Sub: 7074 case Instruction::FSub: 7075 case Instruction::Mul: 7076 case Instruction::FMul: 7077 case Instruction::FDiv: 7078 case Instruction::FRem: 7079 case Instruction::Shl: 7080 case Instruction::LShr: 7081 case Instruction::AShr: 7082 case Instruction::And: 7083 case Instruction::Or: 7084 case Instruction::Xor: { 7085 // Since we will replace the stride by 1 the multiplication should go away. 7086 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7087 return 0; 7088 // Certain instructions can be cheaper to vectorize if they have a constant 7089 // second vector operand. One example of this are shifts on x86. 7090 Value *Op2 = I->getOperand(1); 7091 TargetTransformInfo::OperandValueProperties Op2VP; 7092 TargetTransformInfo::OperandValueKind Op2VK = 7093 TTI.getOperandInfo(Op2, Op2VP); 7094 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7095 Op2VK = TargetTransformInfo::OK_UniformValue; 7096 7097 SmallVector<const Value *, 4> Operands(I->operand_values()); 7098 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7099 return N * TTI.getArithmeticInstrCost( 7100 I->getOpcode(), VectorTy, CostKind, 7101 TargetTransformInfo::OK_AnyValue, 7102 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7103 } 7104 case Instruction::FNeg: { 7105 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7106 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7107 return N * TTI.getArithmeticInstrCost( 7108 I->getOpcode(), VectorTy, CostKind, 7109 TargetTransformInfo::OK_AnyValue, 7110 TargetTransformInfo::OK_AnyValue, 7111 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7112 I->getOperand(0), I); 7113 } 7114 case Instruction::Select: { 7115 SelectInst *SI = cast<SelectInst>(I); 7116 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7117 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7118 Type *CondTy = SI->getCondition()->getType(); 7119 if (!ScalarCond) { 7120 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7121 CondTy = VectorType::get(CondTy, VF); 7122 } 7123 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7124 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7125 } 7126 case Instruction::ICmp: 7127 case Instruction::FCmp: { 7128 Type *ValTy = I->getOperand(0)->getType(); 7129 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7130 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7131 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7132 VectorTy = ToVectorTy(ValTy, VF); 7133 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7134 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7135 } 7136 case Instruction::Store: 7137 case Instruction::Load: { 7138 ElementCount Width = VF; 7139 if (Width.isVector()) { 7140 InstWidening Decision = getWideningDecision(I, Width); 7141 assert(Decision != CM_Unknown && 7142 "CM decision should be taken at this point"); 7143 if (Decision == CM_Scalarize) 7144 Width = ElementCount::getFixed(1); 7145 } 7146 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7147 return getMemoryInstructionCost(I, VF); 7148 } 7149 case Instruction::ZExt: 7150 case Instruction::SExt: 7151 case Instruction::FPToUI: 7152 case Instruction::FPToSI: 7153 case Instruction::FPExt: 7154 case Instruction::PtrToInt: 7155 case Instruction::IntToPtr: 7156 case Instruction::SIToFP: 7157 case Instruction::UIToFP: 7158 case Instruction::Trunc: 7159 case Instruction::FPTrunc: 7160 case Instruction::BitCast: { 7161 // Computes the CastContextHint from a Load/Store instruction. 7162 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7163 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7164 "Expected a load or a store!"); 7165 7166 if (VF.isScalar() || !TheLoop->contains(I)) 7167 return TTI::CastContextHint::Normal; 7168 7169 switch (getWideningDecision(I, VF)) { 7170 case LoopVectorizationCostModel::CM_GatherScatter: 7171 return TTI::CastContextHint::GatherScatter; 7172 case LoopVectorizationCostModel::CM_Interleave: 7173 return TTI::CastContextHint::Interleave; 7174 case LoopVectorizationCostModel::CM_Scalarize: 7175 case LoopVectorizationCostModel::CM_Widen: 7176 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7177 : TTI::CastContextHint::Normal; 7178 case LoopVectorizationCostModel::CM_Widen_Reverse: 7179 return TTI::CastContextHint::Reversed; 7180 case LoopVectorizationCostModel::CM_Unknown: 7181 llvm_unreachable("Instr did not go through cost modelling?"); 7182 } 7183 7184 llvm_unreachable("Unhandled case!"); 7185 }; 7186 7187 unsigned Opcode = I->getOpcode(); 7188 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7189 // For Trunc, the context is the only user, which must be a StoreInst. 
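    // Illustrative IR (hypothetical):
    //   %t = trunc i32 %v to i8
    //   store i8 %t, i8* %p
    // Here the trunc takes its cast context from the store's widening
    // decision.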
7190 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7191 if (I->hasOneUse()) 7192 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7193 CCH = ComputeCCH(Store); 7194 } 7195 // For Z/Sext, the context is the operand, which must be a LoadInst. 7196 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7197 Opcode == Instruction::FPExt) { 7198 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7199 CCH = ComputeCCH(Load); 7200 } 7201 7202 // We optimize the truncation of induction variables having constant 7203 // integer steps. The cost of these truncations is the same as the scalar 7204 // operation. 7205 if (isOptimizableIVTruncate(I, VF)) { 7206 auto *Trunc = cast<TruncInst>(I); 7207 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7208 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7209 } 7210 7211 Type *SrcScalarTy = I->getOperand(0)->getType(); 7212 Type *SrcVecTy = 7213 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7214 if (canTruncateToMinimalBitwidth(I, VF)) { 7215 // This cast is going to be shrunk. This may remove the cast or it might 7216 // turn it into slightly different cast. For example, if MinBW == 16, 7217 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7218 // 7219 // Calculate the modified src and dest types. 7220 Type *MinVecTy = VectorTy; 7221 if (Opcode == Instruction::Trunc) { 7222 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7223 VectorTy = 7224 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7225 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7226 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7227 VectorTy = 7228 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7229 } 7230 } 7231 7232 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7233 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7234 return N * 7235 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7236 } 7237 case Instruction::Call: { 7238 bool NeedToScalarize; 7239 CallInst *CI = cast<CallInst>(I); 7240 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7241 if (getVectorIntrinsicIDForCall(CI, TLI)) 7242 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 7243 return CallCost; 7244 } 7245 case Instruction::ExtractValue: { 7246 InstructionCost ExtractCost = 7247 TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7248 assert(ExtractCost.isValid() && "Invalid cost for ExtractValue"); 7249 return *(ExtractCost.getValue()); 7250 } 7251 default: 7252 // The cost of executing VF copies of the scalar instruction. This opcode 7253 // is unknown. Assume that it is the same as 'mul'. 7254 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7255 Instruction::Mul, VectorTy, CostKind) + 7256 getScalarizationOverhead(I, VF); 7257 } // end of switch. 
7258 } 7259 7260 char LoopVectorize::ID = 0; 7261 7262 static const char lv_name[] = "Loop Vectorization"; 7263 7264 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7265 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7266 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7267 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7268 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7269 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7270 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7271 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7272 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7273 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7274 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7275 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7276 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7277 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7278 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7279 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7280 7281 namespace llvm { 7282 7283 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7284 7285 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7286 bool VectorizeOnlyWhenForced) { 7287 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7288 } 7289 7290 } // end namespace llvm 7291 7292 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7293 // Check if the pointer operand of a load or store instruction is 7294 // consecutive. 7295 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7296 return Legal->isConsecutivePtr(Ptr); 7297 return false; 7298 } 7299 7300 void LoopVectorizationCostModel::collectValuesToIgnore() { 7301 // Ignore ephemeral values. 7302 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7303 7304 // Ignore type-promoting instructions we identified during reduction 7305 // detection. 7306 for (auto &Reduction : Legal->getReductionVars()) { 7307 RecurrenceDescriptor &RedDes = Reduction.second; 7308 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7309 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7310 } 7311 // Ignore type-casting instructions we identified during induction 7312 // detection. 7313 for (auto &Induction : Legal->getInductionVars()) { 7314 InductionDescriptor &IndDes = Induction.second; 7315 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7316 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7317 } 7318 } 7319 7320 void LoopVectorizationCostModel::collectInLoopReductions() { 7321 for (auto &Reduction : Legal->getReductionVars()) { 7322 PHINode *Phi = Reduction.first; 7323 RecurrenceDescriptor &RdxDesc = Reduction.second; 7324 7325 // We don't collect reductions that are type promoted (yet). 7326 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7327 continue; 7328 7329 // If the target would prefer this reduction to happen "in-loop", then we 7330 // want to record it as such. 7331 unsigned Opcode = RdxDesc.getOpcode(); 7332 if (!PreferInLoopReductions && 7333 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7334 TargetTransformInfo::ReductionFlags())) 7335 continue; 7336 7337 // Check that we can correctly put the reductions into the loop, by 7338 // finding the chain of operations that leads from the phi to the loop 7339 // exit value. 
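    // (An "in-loop" reduction computes the scalar reduction value inside the
    // vectorized loop body, e.g. by reducing each vector of elements every
    // iteration, instead of keeping a wide vector accumulator that is only
    // reduced after the loop.)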
7340     SmallVector<Instruction *, 4> ReductionOperations =
7341         RdxDesc.getReductionOpChain(Phi, TheLoop);
7342     bool InLoop = !ReductionOperations.empty();
7343     if (InLoop)
7344       InLoopReductionChains[Phi] = ReductionOperations;
7345     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7346                       << " reduction for phi: " << *Phi << "\n");
7347   }
7348 }
7349 
7350 // TODO: we could return a pair of values that specify the max VF and
7351 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7352 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7353 // doesn't have a cost model that can choose which plan to execute if
7354 // more than one is generated.
7355 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7356                                  LoopVectorizationCostModel &CM) {
7357   unsigned WidestType;
7358   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7359   return WidestVectorRegBits / WidestType;
7360 }
7361 
7362 VectorizationFactor
7363 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7364   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7365   ElementCount VF = UserVF;
7366   // Outer loop handling: such loops may require CFG and instruction-level
7367   // transformations before even evaluating whether vectorization is profitable.
7368   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7369   // the vectorization pipeline.
7370   if (!OrigLoop->isInnermost()) {
7371     // If the user doesn't provide a vectorization factor, determine a
7372     // reasonable one.
7373     if (UserVF.isZero()) {
7374       VF = ElementCount::getFixed(
7375           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7376       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7377 
7378       // Make sure we have a VF > 1 for stress testing.
7379       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7380         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7381                           << "overriding computed VF.\n");
7382         VF = ElementCount::getFixed(4);
7383       }
7384     }
7385     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7386     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7387            "VF needs to be a power of two");
7388     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7389                       << "VF " << VF << " to build VPlans.\n");
7390     buildVPlans(VF, VF);
7391 
7392     // For VPlan build stress testing, we bail out after VPlan construction.
7393     if (VPlanBuildStressTest)
7394       return VectorizationFactor::Disabled();
7395 
7396     return {VF, 0 /*Cost*/};
7397   }
7398 
7399   LLVM_DEBUG(
7400       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7401                 "VPlan-native path.\n");
7402   return VectorizationFactor::Disabled();
7403 }
7404 
7405 Optional<VectorizationFactor>
7406 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7407   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7408   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7409   if (!MaybeMaxVF) // Cases that should be neither vectorized nor interleaved.
7410     return None;
7411 
7412   // Invalidate interleave groups if all blocks of loop will be predicated.
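  // (When the tail is folded by masking, every block of the loop becomes
  // predicated; without target support for masked interleaved loads/stores
  // such groups cannot be emitted as a single wide access, so they are
  // dropped here.)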
7413 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7414 !useMaskedInterleavedAccesses(*TTI)) { 7415 LLVM_DEBUG( 7416 dbgs() 7417 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7418 "which requires masked-interleaved support.\n"); 7419 if (CM.InterleaveInfo.invalidateGroups()) 7420 // Invalidating interleave groups also requires invalidating all decisions 7421 // based on them, which includes widening decisions and uniform and scalar 7422 // values. 7423 CM.invalidateCostModelingDecisions(); 7424 } 7425 7426 ElementCount MaxVF = MaybeMaxVF.getValue(); 7427 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7428 7429 if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) { 7430 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7431 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7432 "VF needs to be a power of two"); 7433 // Collect the instructions (and their associated costs) that will be more 7434 // profitable to scalarize. 7435 CM.selectUserVectorizationFactor(UserVF); 7436 CM.collectInLoopReductions(); 7437 buildVPlansWithVPRecipes(UserVF, UserVF); 7438 LLVM_DEBUG(printPlans(dbgs())); 7439 return {{UserVF, 0}}; 7440 } 7441 7442 assert(!MaxVF.isScalable() && 7443 "Scalable vectors not yet supported beyond this point"); 7444 7445 for (ElementCount VF = ElementCount::getFixed(1); 7446 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7447 // Collect Uniform and Scalar instructions after vectorization with VF. 7448 CM.collectUniformsAndScalars(VF); 7449 7450 // Collect the instructions (and their associated costs) that will be more 7451 // profitable to scalarize. 7452 if (VF.isVector()) 7453 CM.collectInstsToScalarize(VF); 7454 } 7455 7456 CM.collectInLoopReductions(); 7457 7458 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7459 LLVM_DEBUG(printPlans(dbgs())); 7460 if (MaxVF.isScalar()) 7461 return VectorizationFactor::Disabled(); 7462 7463 // Select the optimal vectorization factor. 7464 return CM.selectVectorizationFactor(MaxVF); 7465 } 7466 7467 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7468 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7469 << '\n'); 7470 BestVF = VF; 7471 BestUF = UF; 7472 7473 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7474 return !Plan->hasVF(VF); 7475 }); 7476 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7477 } 7478 7479 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7480 DominatorTree *DT) { 7481 // Perform the actual loop transformation. 7482 7483 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7484 VPCallbackILV CallbackILV(ILV); 7485 7486 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7487 7488 VPTransformState State{*BestVF, BestUF, LI, 7489 DT, ILV.Builder, ILV.VectorLoopValueMap, 7490 &ILV, CallbackILV}; 7491 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7492 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7493 State.CanonicalIV = ILV.Induction; 7494 7495 ILV.printDebugTracesAtStart(); 7496 7497 //===------------------------------------------------===// 7498 // 7499 // Notice: any optimization or new instruction that go 7500 // into the code below should also be implemented in 7501 // the cost-model. 7502 // 7503 //===------------------------------------------------===// 7504 7505 // 2. Copy and widen instructions from the old loop into the new loop. 
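  // (setBestPlan has already pruned VPlans down to the single plan that
  // supports the chosen VF; that plan is executed below.)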
7506 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7507 VPlans.front()->execute(&State); 7508 7509 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7510 // predication, updating analyses. 7511 ILV.fixVectorizedLoop(); 7512 7513 ILV.printDebugTracesAtEnd(); 7514 } 7515 7516 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7517 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7518 7519 // We create new control-flow for the vectorized loop, so the original exit 7520 // conditions will be dead after vectorization if it's only used by the 7521 // terminator 7522 SmallVector<BasicBlock*> ExitingBlocks; 7523 OrigLoop->getExitingBlocks(ExitingBlocks); 7524 for (auto *BB : ExitingBlocks) { 7525 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7526 if (!Cmp || !Cmp->hasOneUse()) 7527 continue; 7528 7529 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7530 if (!DeadInstructions.insert(Cmp).second) 7531 continue; 7532 7533 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7534 // TODO: can recurse through operands in general 7535 for (Value *Op : Cmp->operands()) { 7536 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7537 DeadInstructions.insert(cast<Instruction>(Op)); 7538 } 7539 } 7540 7541 // We create new "steps" for induction variable updates to which the original 7542 // induction variables map. An original update instruction will be dead if 7543 // all its users except the induction variable are dead. 7544 auto *Latch = OrigLoop->getLoopLatch(); 7545 for (auto &Induction : Legal->getInductionVars()) { 7546 PHINode *Ind = Induction.first; 7547 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7548 7549 // If the tail is to be folded by masking, the primary induction variable, 7550 // if exists, isn't dead: it will be used for masking. Don't kill it. 7551 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7552 continue; 7553 7554 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7555 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7556 })) 7557 DeadInstructions.insert(IndUpdate); 7558 7559 // We record as "Dead" also the type-casting instructions we had identified 7560 // during induction analysis. We don't need any handling for them in the 7561 // vectorized loop because we have proven that, under a proper runtime 7562 // test guarding the vectorized loop, the value of the phi, and the casted 7563 // value of the phi, are the same. The last instruction in this casting chain 7564 // will get its scalar/vector/widened def from the scalar/vector/widened def 7565 // of the respective phi node. Any other casts in the induction def-use chain 7566 // have no other uses outside the phi update chain, and will be ignored. 7567 InductionDescriptor &IndDes = Induction.second; 7568 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7569 DeadInstructions.insert(Casts.begin(), Casts.end()); 7570 } 7571 } 7572 7573 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7574 7575 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7576 7577 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7578 Instruction::BinaryOps BinOp) { 7579 // When unrolling and the VF is 1, we only need to add a simple scalar. 
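  // For instance, assuming the caller passes the unroll part index as
  // StartIdx (as the unroller does when VF == 1), the second unrolled copy
  // receives StartIdx == 1 and therefore evaluates to Val + 1 * Step.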
7580 Type *Ty = Val->getType(); 7581 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7582 7583 if (Ty->isFloatingPointTy()) { 7584 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7585 7586 // Floating point operations had to be 'fast' to enable the unrolling. 7587 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7588 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7589 } 7590 Constant *C = ConstantInt::get(Ty, StartIdx); 7591 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7592 } 7593 7594 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7595 SmallVector<Metadata *, 4> MDs; 7596 // Reserve first location for self reference to the LoopID metadata node. 7597 MDs.push_back(nullptr); 7598 bool IsUnrollMetadata = false; 7599 MDNode *LoopID = L->getLoopID(); 7600 if (LoopID) { 7601 // First find existing loop unrolling disable metadata. 7602 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7603 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7604 if (MD) { 7605 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7606 IsUnrollMetadata = 7607 S && S->getString().startswith("llvm.loop.unroll.disable"); 7608 } 7609 MDs.push_back(LoopID->getOperand(i)); 7610 } 7611 } 7612 7613 if (!IsUnrollMetadata) { 7614 // Add runtime unroll disable metadata. 7615 LLVMContext &Context = L->getHeader()->getContext(); 7616 SmallVector<Metadata *, 1> DisableOperands; 7617 DisableOperands.push_back( 7618 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7619 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7620 MDs.push_back(DisableNode); 7621 MDNode *NewLoopID = MDNode::get(Context, MDs); 7622 // Set operand 0 to refer to the loop id itself. 7623 NewLoopID->replaceOperandWith(0, NewLoopID); 7624 L->setLoopID(NewLoopID); 7625 } 7626 } 7627 7628 //===--------------------------------------------------------------------===// 7629 // EpilogueVectorizerMainLoop 7630 //===--------------------------------------------------------------------===// 7631 7632 /// This function is partially responsible for generating the control flow 7633 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7634 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7635 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7636 Loop *Lp = createVectorLoopSkeleton(""); 7637 7638 // Generate the code to check the minimum iteration count of the vector 7639 // epilogue (see below). 7640 EPI.EpilogueIterationCountCheck = 7641 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7642 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7643 7644 // Generate the code to check any assumptions that we've made for SCEV 7645 // expressions. 7646 BasicBlock *SavedPreHeader = LoopVectorPreHeader; 7647 emitSCEVChecks(Lp, LoopScalarPreHeader); 7648 7649 // If a safety check was generated save it. 7650 if (SavedPreHeader != LoopVectorPreHeader) 7651 EPI.SCEVSafetyCheck = SavedPreHeader; 7652 7653 // Generate the code that checks at runtime if arrays overlap. We put the 7654 // checks into a separate block to make the more common case of few elements 7655 // faster. 7656 SavedPreHeader = LoopVectorPreHeader; 7657 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 7658 7659 // If a safety check was generated save/overwite it. 
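  // (The check emitters split off a fresh preheader whenever they actually
  // emit a check, so a changed LoopVectorPreHeader indicates that the
  // previously saved block now holds the memory runtime check.)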
7660 if (SavedPreHeader != LoopVectorPreHeader) 7661 EPI.MemSafetyCheck = SavedPreHeader; 7662 7663 // Generate the iteration count check for the main loop, *after* the check 7664 // for the epilogue loop, so that the path-length is shorter for the case 7665 // that goes directly through the vector epilogue. The longer-path length for 7666 // the main loop is compensated for, by the gain from vectorizing the larger 7667 // trip count. Note: the branch will get updated later on when we vectorize 7668 // the epilogue. 7669 EPI.MainLoopIterationCountCheck = 7670 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7671 7672 // Generate the induction variable. 7673 OldInduction = Legal->getPrimaryInduction(); 7674 Type *IdxTy = Legal->getWidestInductionType(); 7675 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7676 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7677 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7678 EPI.VectorTripCount = CountRoundDown; 7679 Induction = 7680 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7681 getDebugLocFromInstOrOperands(OldInduction)); 7682 7683 // Skip induction resume value creation here because they will be created in 7684 // the second pass. If we created them here, they wouldn't be used anyway, 7685 // because the vplan in the second pass still contains the inductions from the 7686 // original loop. 7687 7688 return completeLoopSkeleton(Lp, OrigLoopID); 7689 } 7690 7691 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7692 LLVM_DEBUG({ 7693 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7694 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7695 << ", Main Loop UF:" << EPI.MainLoopUF 7696 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7697 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7698 }); 7699 } 7700 7701 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7702 DEBUG_WITH_TYPE(VerboseDebug, { 7703 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 7704 }); 7705 } 7706 7707 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 7708 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 7709 assert(L && "Expected valid Loop."); 7710 assert(Bypass && "Expected valid bypass basic block."); 7711 unsigned VFactor = 7712 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 7713 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7714 Value *Count = getOrCreateTripCount(L); 7715 // Reuse existing vector loop preheader for TC checks. 7716 // Note that new preheader block is generated for vector loop. 7717 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7718 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7719 7720 // Generate code to check if the loop's trip count is less than VF * UF of the 7721 // main vector loop. 7722 auto P = 7723 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7724 7725 Value *CheckMinIters = Builder.CreateICmp( 7726 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 7727 "min.iters.check"); 7728 7729 if (!ForEpilogue) 7730 TCCheckBlock->setName("vector.main.loop.iter.check"); 7731 7732 // Create new preheader for vector loop. 
7733 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7734 DT, LI, nullptr, "vector.ph"); 7735 7736 if (ForEpilogue) { 7737 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7738 DT->getNode(Bypass)->getIDom()) && 7739 "TC check is expected to dominate Bypass"); 7740 7741 // Update dominator for Bypass & LoopExit. 7742 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7743 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7744 7745 LoopBypassBlocks.push_back(TCCheckBlock); 7746 7747 // Save the trip count so we don't have to regenerate it in the 7748 // vec.epilog.iter.check. This is safe to do because the trip count 7749 // generated here dominates the vector epilog iter check. 7750 EPI.TripCount = Count; 7751 } 7752 7753 ReplaceInstWithInst( 7754 TCCheckBlock->getTerminator(), 7755 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7756 7757 return TCCheckBlock; 7758 } 7759 7760 //===--------------------------------------------------------------------===// 7761 // EpilogueVectorizerEpilogueLoop 7762 //===--------------------------------------------------------------------===// 7763 7764 /// This function is partially responsible for generating the control flow 7765 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7766 BasicBlock * 7767 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7768 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7769 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 7770 7771 // Now, compare the remaining count and if there aren't enough iterations to 7772 // execute the vectorized epilogue skip to the scalar part. 7773 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7774 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7775 LoopVectorPreHeader = 7776 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7777 LI, nullptr, "vec.epilog.ph"); 7778 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 7779 VecEpilogueIterationCountCheck); 7780 7781 // Adjust the control flow taking the state info from the main loop 7782 // vectorization into account. 7783 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7784 "expected this to be saved from the previous pass."); 7785 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7786 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7787 7788 DT->changeImmediateDominator(LoopVectorPreHeader, 7789 EPI.MainLoopIterationCountCheck); 7790 7791 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7792 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7793 7794 if (EPI.SCEVSafetyCheck) 7795 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7796 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7797 if (EPI.MemSafetyCheck) 7798 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7799 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7800 7801 DT->changeImmediateDominator( 7802 VecEpilogueIterationCountCheck, 7803 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7804 7805 DT->changeImmediateDominator(LoopScalarPreHeader, 7806 EPI.EpilogueIterationCountCheck); 7807 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 7808 7809 // Keep track of bypass blocks, as they feed start values to the induction 7810 // phis in the scalar loop preheader. 
7811   if (EPI.SCEVSafetyCheck)
7812     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7813   if (EPI.MemSafetyCheck)
7814     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7815   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7816 
7817   // Generate a resume induction for the vector epilogue and put it in the
7818   // vector epilogue preheader.
7819   Type *IdxTy = Legal->getWidestInductionType();
7820   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7821                                          LoopVectorPreHeader->getFirstNonPHI());
7822   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7823   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7824                            EPI.MainLoopIterationCountCheck);
7825 
7826   // Generate the induction variable.
7827   OldInduction = Legal->getPrimaryInduction();
7828   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7829   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7830   Value *StartIdx = EPResumeVal;
7831   Induction =
7832       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7833                               getDebugLocFromInstOrOperands(OldInduction));
7834 
7835   // Generate induction resume values. These variables save the new starting
7836   // indexes for the scalar loop. They are used to test if there are any tail
7837   // iterations left once the vector loop has completed.
7838   // Note that when the vectorized epilogue is skipped due to the iteration
7839   // count check, the resume value for the induction variable comes from
7840   // the trip count of the main vector loop, hence passing the AdditionalBypass
7841   // argument.
7842   createInductionResumeValues(Lp, CountRoundDown,
7843                               {VecEpilogueIterationCountCheck,
7844                                EPI.VectorTripCount} /* AdditionalBypass */);
7845 
7846   AddRuntimeUnrollDisableMetaData(Lp);
7847   return completeLoopSkeleton(Lp, OrigLoopID);
7848 }
7849 
7850 BasicBlock *
7851 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7852     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
7853 
7854   assert(EPI.TripCount &&
7855          "Expected trip count to have been saved in the first pass.");
7856   assert(
7857       (!isa<Instruction>(EPI.TripCount) ||
7858        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7859       "saved trip count does not dominate insertion point.");
7860   Value *TC = EPI.TripCount;
7861   IRBuilder<> Builder(Insert->getTerminator());
7862   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7863 
7864   // Generate code to check if the loop's trip count is less than VF * UF of the
7865   // vector epilogue loop.
7866   auto P =
7867       Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7868 7869 Value *CheckMinIters = Builder.CreateICmp( 7870 P, Count, 7871 ConstantInt::get(Count->getType(), 7872 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 7873 "min.epilog.iters.check"); 7874 7875 ReplaceInstWithInst( 7876 Insert->getTerminator(), 7877 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7878 7879 LoopBypassBlocks.push_back(Insert); 7880 return Insert; 7881 } 7882 7883 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7884 LLVM_DEBUG({ 7885 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7886 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7887 << ", Main Loop UF:" << EPI.MainLoopUF 7888 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7889 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7890 }); 7891 } 7892 7893 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7894 DEBUG_WITH_TYPE(VerboseDebug, { 7895 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 7896 }); 7897 } 7898 7899 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7900 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7901 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7902 bool PredicateAtRangeStart = Predicate(Range.Start); 7903 7904 for (ElementCount TmpVF = Range.Start * 2; 7905 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 7906 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7907 Range.End = TmpVF; 7908 break; 7909 } 7910 7911 return PredicateAtRangeStart; 7912 } 7913 7914 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7915 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7916 /// of VF's starting at a given VF and extending it as much as possible. Each 7917 /// vectorization decision can potentially shorten this sub-range during 7918 /// buildVPlan(). 7919 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7920 ElementCount MaxVF) { 7921 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 7922 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 7923 VFRange SubRange = {VF, MaxVFPlusOne}; 7924 VPlans.push_back(buildVPlan(SubRange)); 7925 VF = SubRange.End; 7926 } 7927 } 7928 7929 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7930 VPlanPtr &Plan) { 7931 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7932 7933 // Look for cached value. 7934 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7935 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7936 if (ECEntryIt != EdgeMaskCache.end()) 7937 return ECEntryIt->second; 7938 7939 VPValue *SrcMask = createBlockInMask(Src, Plan); 7940 7941 // The terminator has to be a branch inst! 7942 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7943 assert(BI && "Unexpected terminator found"); 7944 7945 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7946 return EdgeMaskCache[Edge] = SrcMask; 7947 7948 // If source is an exiting block, we know the exit edge is dynamically dead 7949 // in the vector loop, and thus we don't need to restrict the mask. Avoid 7950 // adding uses of an otherwise potentially dead instruction. 
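  // (Returning SrcMask unchanged below means the exit edge simply inherits
  // the source block's mask.)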
7951 if (OrigLoop->isLoopExiting(Src)) 7952 return EdgeMaskCache[Edge] = SrcMask; 7953 7954 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 7955 assert(EdgeMask && "No Edge Mask found for condition"); 7956 7957 if (BI->getSuccessor(0) != Dst) 7958 EdgeMask = Builder.createNot(EdgeMask); 7959 7960 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 7961 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 7962 7963 return EdgeMaskCache[Edge] = EdgeMask; 7964 } 7965 7966 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7967 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7968 7969 // Look for cached value. 7970 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7971 if (BCEntryIt != BlockMaskCache.end()) 7972 return BCEntryIt->second; 7973 7974 // All-one mask is modelled as no-mask following the convention for masked 7975 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7976 VPValue *BlockMask = nullptr; 7977 7978 if (OrigLoop->getHeader() == BB) { 7979 if (!CM.blockNeedsPredication(BB)) 7980 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 7981 7982 // Create the block in mask as the first non-phi instruction in the block. 7983 VPBuilder::InsertPointGuard Guard(Builder); 7984 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 7985 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 7986 7987 // Introduce the early-exit compare IV <= BTC to form header block mask. 7988 // This is used instead of IV < TC because TC may wrap, unlike BTC. 7989 // Start by constructing the desired canonical IV. 7990 VPValue *IV = nullptr; 7991 if (Legal->getPrimaryInduction()) 7992 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 7993 else { 7994 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 7995 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 7996 IV = IVRecipe->getVPValue(); 7997 } 7998 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 7999 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8000 8001 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8002 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8003 // as a second argument, we only pass the IV here and extract the 8004 // tripcount from the transform state where codegen of the VP instructions 8005 // happen. 8006 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8007 } else { 8008 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8009 } 8010 return BlockMaskCache[BB] = BlockMask; 8011 } 8012 8013 // This is the block mask. We OR all incoming edges. 8014 for (auto *Predecessor : predecessors(BB)) { 8015 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8016 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8017 return BlockMaskCache[BB] = EdgeMask; 8018 8019 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8020 BlockMask = EdgeMask; 8021 continue; 8022 } 8023 8024 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8025 } 8026 8027 return BlockMaskCache[BB] = BlockMask; 8028 } 8029 8030 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8031 VPlanPtr &Plan) { 8032 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8033 "Must be called with either a load or store"); 8034 8035 auto willWiden = [&](ElementCount VF) -> bool { 8036 if (VF.isScalar()) 8037 return false; 8038 LoopVectorizationCostModel::InstWidening Decision = 8039 CM.getWideningDecision(I, VF); 8040 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8041 "CM decision should be taken at this point."); 8042 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8043 return true; 8044 if (CM.isScalarAfterVectorization(I, VF) || 8045 CM.isProfitableToScalarize(I, VF)) 8046 return false; 8047 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8048 }; 8049 8050 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8051 return nullptr; 8052 8053 VPValue *Mask = nullptr; 8054 if (Legal->isMaskRequired(I)) 8055 Mask = createBlockInMask(I->getParent(), Plan); 8056 8057 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8058 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8059 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8060 8061 StoreInst *Store = cast<StoreInst>(I); 8062 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8063 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8064 } 8065 8066 VPWidenIntOrFpInductionRecipe * 8067 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { 8068 // Check if this is an integer or fp induction. If so, build the recipe that 8069 // produces its scalar and vector values. 8070 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8071 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8072 II.getKind() == InductionDescriptor::IK_FpInduction) { 8073 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8074 return new VPWidenIntOrFpInductionRecipe(Phi, Start); 8075 } 8076 8077 return nullptr; 8078 } 8079 8080 VPWidenIntOrFpInductionRecipe * 8081 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, 8082 VPlan &Plan) const { 8083 // Optimize the special case where the source is a constant integer 8084 // induction variable. Notice that we can only optimize the 'trunc' case 8085 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8086 // (c) other casts depend on pointer size. 8087 8088 // Determine whether \p K is a truncation based on an induction variable that 8089 // can be optimized. 
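  // Illustrative IR (hypothetical):
  //   %iv = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
  //   %t  = trunc i64 %iv to i32
  // With a constant integer step, the trunc can be folded into a narrower
  // (i32) induction instead of truncating a wide i64 vector.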
8090 auto isOptimizableIVTruncate = 8091 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8092 return [=](ElementCount VF) -> bool { 8093 return CM.isOptimizableIVTruncate(K, VF); 8094 }; 8095 }; 8096 8097 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8098 isOptimizableIVTruncate(I), Range)) { 8099 8100 InductionDescriptor II = 8101 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8102 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8103 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8104 Start, I); 8105 } 8106 return nullptr; 8107 } 8108 8109 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8110 // We know that all PHIs in non-header blocks are converted into selects, so 8111 // we don't have to worry about the insertion order and we can just use the 8112 // builder. At this point we generate the predication tree. There may be 8113 // duplications since this is a simple recursive scan, but future 8114 // optimizations will clean it up. 8115 8116 SmallVector<VPValue *, 2> Operands; 8117 unsigned NumIncoming = Phi->getNumIncomingValues(); 8118 for (unsigned In = 0; In < NumIncoming; In++) { 8119 VPValue *EdgeMask = 8120 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8121 assert((EdgeMask || NumIncoming == 1) && 8122 "Multiple predecessors with one having a full mask"); 8123 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8124 if (EdgeMask) 8125 Operands.push_back(EdgeMask); 8126 } 8127 return new VPBlendRecipe(Phi, Operands); 8128 } 8129 8130 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8131 VPlan &Plan) const { 8132 8133 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8134 [this, CI](ElementCount VF) { 8135 return CM.isScalarWithPredication(CI, VF); 8136 }, 8137 Range); 8138 8139 if (IsPredicated) 8140 return nullptr; 8141 8142 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8143 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8144 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8145 ID == Intrinsic::pseudoprobe)) 8146 return nullptr; 8147 8148 auto willWiden = [&](ElementCount VF) -> bool { 8149 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8150 // The following case may be scalarized depending on the VF. 8151 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8152 // version of the instruction. 8153 // Is it beneficial to perform intrinsic call compared to lib call? 8154 bool NeedToScalarize = false; 8155 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8156 bool UseVectorIntrinsic = 8157 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 8158 return UseVectorIntrinsic || !NeedToScalarize; 8159 }; 8160 8161 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8162 return nullptr; 8163 8164 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8165 } 8166 8167 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8168 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8169 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8170 // Instruction should be widened, unless it is scalar after vectorization, 8171 // scalarization is profitable or it is predicated. 
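  // Worked example with assumed numbers: if Range is [4, 32) and WillScalarize
  // is false for VF=4 and VF=8 but true for VF=16, the call below clamps the
  // range to [4, 16) so that every VF left in it agrees with the decision
  // taken at VF=4, and shouldWiden then returns true for that sub-range.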
8172 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8173 return CM.isScalarAfterVectorization(I, VF) || 8174 CM.isProfitableToScalarize(I, VF) || 8175 CM.isScalarWithPredication(I, VF); 8176 }; 8177 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8178 Range); 8179 } 8180 8181 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8182 auto IsVectorizableOpcode = [](unsigned Opcode) { 8183 switch (Opcode) { 8184 case Instruction::Add: 8185 case Instruction::And: 8186 case Instruction::AShr: 8187 case Instruction::BitCast: 8188 case Instruction::FAdd: 8189 case Instruction::FCmp: 8190 case Instruction::FDiv: 8191 case Instruction::FMul: 8192 case Instruction::FNeg: 8193 case Instruction::FPExt: 8194 case Instruction::FPToSI: 8195 case Instruction::FPToUI: 8196 case Instruction::FPTrunc: 8197 case Instruction::FRem: 8198 case Instruction::FSub: 8199 case Instruction::ICmp: 8200 case Instruction::IntToPtr: 8201 case Instruction::LShr: 8202 case Instruction::Mul: 8203 case Instruction::Or: 8204 case Instruction::PtrToInt: 8205 case Instruction::SDiv: 8206 case Instruction::Select: 8207 case Instruction::SExt: 8208 case Instruction::Shl: 8209 case Instruction::SIToFP: 8210 case Instruction::SRem: 8211 case Instruction::Sub: 8212 case Instruction::Trunc: 8213 case Instruction::UDiv: 8214 case Instruction::UIToFP: 8215 case Instruction::URem: 8216 case Instruction::Xor: 8217 case Instruction::ZExt: 8218 return true; 8219 } 8220 return false; 8221 }; 8222 8223 if (!IsVectorizableOpcode(I->getOpcode())) 8224 return nullptr; 8225 8226 // Success: widen this instruction. 8227 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8228 } 8229 8230 VPBasicBlock *VPRecipeBuilder::handleReplication( 8231 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8232 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 8233 VPlanPtr &Plan) { 8234 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8235 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8236 Range); 8237 8238 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8239 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8240 Range); 8241 8242 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8243 IsUniform, IsPredicated); 8244 setRecipe(I, Recipe); 8245 Plan->addVPValue(I, Recipe); 8246 8247 // Find if I uses a predicated instruction. If so, it will use its scalar 8248 // value. Avoid hoisting the insert-element which packs the scalar value into 8249 // a vector value, as that happens iff all users use the vector value. 8250 for (auto &Op : I->operands()) 8251 if (auto *PredInst = dyn_cast<Instruction>(Op)) 8252 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 8253 PredInst2Recipe[PredInst]->setAlsoPack(false); 8254 8255 // Finalize the recipe for Instr, first if it is not predicated. 8256 if (!IsPredicated) { 8257 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8258 VPBB->appendRecipe(Recipe); 8259 return VPBB; 8260 } 8261 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8262 assert(VPBB->getSuccessors().empty() && 8263 "VPBB has successors when handling predicated replication."); 8264 // Record predicated instructions for above packing optimizations. 
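  // (Simplified, descriptive note: a predicated instruction is replicated per
  // lane inside its own "pred.*" if-then region, and if any user still needs
  // the vector form the per-lane results are re-packed with insertelement;
  // that is what AlsoPack controls. Recording the recipe in the map below lets
  // a later scalar user of this instruction switch the packing off again via
  // the loop over operands above.)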
8265 PredInst2Recipe[I] = Recipe; 8266 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8267 VPBlockUtils::insertBlockAfter(Region, VPBB); 8268 auto *RegSucc = new VPBasicBlock(); 8269 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8270 return RegSucc; 8271 } 8272 8273 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8274 VPRecipeBase *PredRecipe, 8275 VPlanPtr &Plan) { 8276 // Instructions marked for predication are replicated and placed under an 8277 // if-then construct to prevent side-effects. 8278 8279 // Generate recipes to compute the block mask for this region. 8280 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8281 8282 // Build the triangular if-then region. 8283 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8284 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8285 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8286 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8287 auto *PHIRecipe = Instr->getType()->isVoidTy() 8288 ? nullptr 8289 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8290 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8291 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8292 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8293 8294 // Note: first set Entry as region entry and then connect successors starting 8295 // from it in order, to propagate the "parent" of each VPBasicBlock. 8296 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8297 VPBlockUtils::connectBlocks(Pred, Exit); 8298 8299 return Region; 8300 } 8301 8302 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8303 VFRange &Range, 8304 VPlanPtr &Plan) { 8305 // First, check for specific widening recipes that deal with calls, memory 8306 // operations, inductions and Phi nodes. 8307 if (auto *CI = dyn_cast<CallInst>(Instr)) 8308 return tryToWidenCall(CI, Range, *Plan); 8309 8310 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8311 return tryToWidenMemory(Instr, Range, Plan); 8312 8313 VPRecipeBase *Recipe; 8314 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8315 if (Phi->getParent() != OrigLoop->getHeader()) 8316 return tryToBlend(Phi, Plan); 8317 if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan))) 8318 return Recipe; 8319 return new VPWidenPHIRecipe(Phi); 8320 } 8321 8322 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( 8323 cast<TruncInst>(Instr), Range, *Plan))) 8324 return Recipe; 8325 8326 if (!shouldWiden(Instr, Range)) 8327 return nullptr; 8328 8329 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8330 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 8331 OrigLoop); 8332 8333 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8334 bool InvariantCond = 8335 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8336 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 8337 InvariantCond); 8338 } 8339 8340 return tryToWiden(Instr, *Plan); 8341 } 8342 8343 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8344 ElementCount MaxVF) { 8345 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8346 8347 // Collect instructions from the original loop that will become trivially dead 8348 // in the vectorized loop. We don't need to vectorize these instructions. 
For 8349 // example, original induction update instructions can become dead because we 8350 // separately emit induction "steps" when generating code for the new loop. 8351 // Similarly, we create a new latch condition when setting up the structure 8352 // of the new loop, so the old one can become dead. 8353 SmallPtrSet<Instruction *, 4> DeadInstructions; 8354 collectTriviallyDeadInstructions(DeadInstructions); 8355 8356 // Add assume instructions we need to drop to DeadInstructions, to prevent 8357 // them from being added to the VPlan. 8358 // TODO: We only need to drop assumes in blocks that get flattend. If the 8359 // control flow is preserved, we should keep them. 8360 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 8361 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 8362 8363 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 8364 // Dead instructions do not need sinking. Remove them from SinkAfter. 8365 for (Instruction *I : DeadInstructions) 8366 SinkAfter.erase(I); 8367 8368 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8369 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8370 VFRange SubRange = {VF, MaxVFPlusOne}; 8371 VPlans.push_back( 8372 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8373 VF = SubRange.End; 8374 } 8375 } 8376 8377 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8378 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8379 const DenseMap<Instruction *, Instruction *> &SinkAfter) { 8380 8381 // Hold a mapping from predicated instructions to their recipes, in order to 8382 // fix their AlsoPack behavior if a user is determined to replicate and use a 8383 // scalar instead of vector value. 8384 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 8385 8386 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8387 8388 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8389 8390 // --------------------------------------------------------------------------- 8391 // Pre-construction: record ingredients whose recipes we'll need to further 8392 // process after constructing the initial VPlan. 8393 // --------------------------------------------------------------------------- 8394 8395 // Mark instructions we'll need to sink later and their targets as 8396 // ingredients whose recipe we'll need to record. 8397 for (auto &Entry : SinkAfter) { 8398 RecipeBuilder.recordRecipeOf(Entry.first); 8399 RecipeBuilder.recordRecipeOf(Entry.second); 8400 } 8401 for (auto &Reduction : CM.getInLoopReductionChains()) { 8402 PHINode *Phi = Reduction.first; 8403 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); 8404 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8405 8406 RecipeBuilder.recordRecipeOf(Phi); 8407 for (auto &R : ReductionOperations) { 8408 RecipeBuilder.recordRecipeOf(R); 8409 // For min/max reducitons, where we have a pair of icmp/select, we also 8410 // need to record the ICmp recipe, so it can be removed later. 8411 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8412 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8413 } 8414 } 8415 8416 // For each interleave group which is relevant for this (possibly trimmed) 8417 // Range, add it to the set of groups to be later applied to the VPlan and add 8418 // placeholders for its members' Recipes which we'll be replacing with a 8419 // single VPInterleaveRecipe. 
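  // Illustrative example (assumed source, not from a particular test): in
  //
  //   for (i = 0; i < n; i++) {
  //     Sum1 += A[2 * i];     // even elements
  //     Sum2 += A[2 * i + 1]; // odd elements
  //   }
  //
  // the two loads form one interleave group with factor 2. Each load gets a
  // placeholder widen-memory recipe here, and both are later replaced by a
  // single VPInterleaveRecipe that emits one wide load plus shuffles to
  // de-interleave the even and odd lanes.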
8420 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8421 auto applyIG = [IG, this](ElementCount VF) -> bool { 8422 return (VF.isVector() && // Query is illegal for VF == 1 8423 CM.getWideningDecision(IG->getInsertPos(), VF) == 8424 LoopVectorizationCostModel::CM_Interleave); 8425 }; 8426 if (!getDecisionAndClampRange(applyIG, Range)) 8427 continue; 8428 InterleaveGroups.insert(IG); 8429 for (unsigned i = 0; i < IG->getFactor(); i++) 8430 if (Instruction *Member = IG->getMember(i)) 8431 RecipeBuilder.recordRecipeOf(Member); 8432 }; 8433 8434 // --------------------------------------------------------------------------- 8435 // Build initial VPlan: Scan the body of the loop in a topological order to 8436 // visit each basic block after having visited its predecessor basic blocks. 8437 // --------------------------------------------------------------------------- 8438 8439 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 8440 auto Plan = std::make_unique<VPlan>(); 8441 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 8442 Plan->setEntry(VPBB); 8443 8444 // Scan the body of the loop in a topological order to visit each basic block 8445 // after having visited its predecessor basic blocks. 8446 LoopBlocksDFS DFS(OrigLoop); 8447 DFS.perform(LI); 8448 8449 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8450 // Relevant instructions from basic block BB will be grouped into VPRecipe 8451 // ingredients and fill a new VPBasicBlock. 8452 unsigned VPBBsForBB = 0; 8453 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 8454 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 8455 VPBB = FirstVPBBForBB; 8456 Builder.setInsertPoint(VPBB); 8457 8458 // Introduce each ingredient into VPlan. 8459 // TODO: Model and preserve debug instrinsics in VPlan. 8460 for (Instruction &I : BB->instructionsWithoutDebug()) { 8461 Instruction *Instr = &I; 8462 8463 // First filter out irrelevant instructions, to ensure no recipes are 8464 // built for them. 8465 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8466 continue; 8467 8468 if (auto Recipe = 8469 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 8470 for (auto *Def : Recipe->definedValues()) { 8471 auto *UV = Def->getUnderlyingValue(); 8472 Plan->addVPValue(UV, Def); 8473 } 8474 8475 RecipeBuilder.setRecipe(Instr, Recipe); 8476 VPBB->appendRecipe(Recipe); 8477 continue; 8478 } 8479 8480 // Otherwise, if all widening options failed, Instruction is to be 8481 // replicated. This may create a successor for VPBB. 8482 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 8483 Instr, Range, VPBB, PredInst2Recipe, Plan); 8484 if (NextVPBB != VPBB) { 8485 VPBB = NextVPBB; 8486 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8487 : ""); 8488 } 8489 } 8490 } 8491 8492 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 8493 // may also be empty, such as the last one VPBB, reflecting original 8494 // basic-blocks with no recipes. 8495 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 8496 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 8497 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 8498 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 8499 delete PreEntry; 8500 8501 // --------------------------------------------------------------------------- 8502 // Transform initial VPlan: Apply previously taken decisions, in order, to 8503 // bring the VPlan to its final state. 
8504 // --------------------------------------------------------------------------- 8505 8506 // Apply Sink-After legal constraints. 8507 for (auto &Entry : SinkAfter) { 8508 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8509 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8510 Sink->moveAfter(Target); 8511 } 8512 8513 // Interleave memory: for each Interleave Group we marked earlier as relevant 8514 // for this VPlan, replace the Recipes widening its memory instructions with a 8515 // single VPInterleaveRecipe at its insertion point. 8516 for (auto IG : InterleaveGroups) { 8517 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8518 RecipeBuilder.getRecipe(IG->getInsertPos())); 8519 SmallVector<VPValue *, 4> StoredValues; 8520 for (unsigned i = 0; i < IG->getFactor(); ++i) 8521 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8522 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8523 8524 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8525 Recipe->getMask()); 8526 VPIG->insertBefore(Recipe); 8527 unsigned J = 0; 8528 for (unsigned i = 0; i < IG->getFactor(); ++i) 8529 if (Instruction *Member = IG->getMember(i)) { 8530 if (!Member->getType()->isVoidTy()) { 8531 VPValue *OriginalV = Plan->getVPValue(Member); 8532 Plan->removeVPValueFor(Member); 8533 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8534 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8535 J++; 8536 } 8537 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8538 } 8539 } 8540 8541 // Adjust the recipes for any inloop reductions. 8542 if (Range.Start.isVector()) 8543 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8544 8545 // Finally, if tail is folded by masking, introduce selects between the phi 8546 // and the live-out instruction of each reduction, at the end of the latch. 8547 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8548 Builder.setInsertPoint(VPBB); 8549 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8550 for (auto &Reduction : Legal->getReductionVars()) { 8551 if (CM.isInLoopReduction(Reduction.first)) 8552 continue; 8553 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8554 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8555 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8556 } 8557 } 8558 8559 std::string PlanName; 8560 raw_string_ostream RSO(PlanName); 8561 ElementCount VF = Range.Start; 8562 Plan->addVF(VF); 8563 RSO << "Initial VPlan for VF={" << VF; 8564 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8565 Plan->addVF(VF); 8566 RSO << "," << VF; 8567 } 8568 RSO << "},UF>=1"; 8569 RSO.flush(); 8570 Plan->setName(PlanName); 8571 8572 return Plan; 8573 } 8574 8575 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8576 // Outer loop handling: They may require CFG and instruction level 8577 // transformations before even evaluating whether vectorization is profitable. 8578 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8579 // the vectorization pipeline. 
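  // Rough shape of the native-path construction below (descriptive only): the
  // H-CFG builder wraps the loop nest's existing CFG into VPlan blocks holding
  // plain VPInstructions, the candidate VFs from Range are attached as powers
  // of two, and, unless VPlan predication is requested, those VPInstructions
  // are then lowered to the same widening recipes the inner-loop path uses.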
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
       VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // the VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(
      OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
  return Plan;
}

// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert(isa<VPWidenRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      auto *CondOp = CM.foldTailByMasking()
                         ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
                         : nullptr;
      VPReductionRecipe *RedRecipe = new VPReductionRecipe(
          &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
      WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
      Plan->removeVPValueFor(R);
      Plan->addVPValue(R, RedRecipe);
      WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
      WidenRecipe->eraseFromParent();

      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        VPRecipeBase *CompareRecipe =
            RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
        assert(isa<VPWidenRecipe>(CompareRecipe) &&
               "Expected to replace a VPWidenSC");
        assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
               "Expected no remaining users");
        CompareRecipe->eraseFromParent();
      }
      Chain = R;
    }
  }
}

Value* LoopVectorizationPlanner::VPCallbackILV::
getOrCreateVectorValues(Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}

Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
    Value *V, const VPIteration &Instance) {
  return ILV.getOrCreateScalarValue(V, Instance);
}

void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  VPValue *Mask = getMask();
  if (Mask) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }
  for (unsigned i = 0; i < IG->getFactor(); ++i)
    if (Instruction *I = IG->getMember(i))
      O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i;
}

void VPWidenCallRecipe::execute(VPTransformState &State) {
  State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
                                  *this, State);
}

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
                                    this, *this, InvariantCond, State);
}

void VPWidenRecipe::execute(VPTransformState &State) {
  State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
}

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
                      *this, State.UF, State.VF, IsPtrLoopInvariant,
                      IsIndexLoopInvariant, State);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
                                   Trunc);
}

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
}

void VPBlendRecipe::execute(VPTransformState &State) {
  State.ILV->setDebugLocFromInst(State.Builder, Phi);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree.
There may be 8732 // duplications since this is a simple recursive scan, but future 8733 // optimizations will clean it up. 8734 8735 unsigned NumIncoming = getNumIncomingValues(); 8736 8737 // Generate a sequence of selects of the form: 8738 // SELECT(Mask3, In3, 8739 // SELECT(Mask2, In2, 8740 // SELECT(Mask1, In1, 8741 // In0))) 8742 // Note that Mask0 is never used: lanes for which no path reaches this phi and 8743 // are essentially undef are taken from In0. 8744 InnerLoopVectorizer::VectorParts Entry(State.UF); 8745 for (unsigned In = 0; In < NumIncoming; ++In) { 8746 for (unsigned Part = 0; Part < State.UF; ++Part) { 8747 // We might have single edge PHIs (blocks) - use an identity 8748 // 'select' for the first PHI operand. 8749 Value *In0 = State.get(getIncomingValue(In), Part); 8750 if (In == 0) 8751 Entry[Part] = In0; // Initialize with the first incoming value. 8752 else { 8753 // Select between the current value and the previous incoming edge 8754 // based on the incoming mask. 8755 Value *Cond = State.get(getMask(In), Part); 8756 Entry[Part] = 8757 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 8758 } 8759 } 8760 } 8761 for (unsigned Part = 0; Part < State.UF; ++Part) 8762 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 8763 } 8764 8765 void VPInterleaveRecipe::execute(VPTransformState &State) { 8766 assert(!State.Instance && "Interleave group being replicated."); 8767 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 8768 getStoredValues(), getMask()); 8769 } 8770 8771 void VPReductionRecipe::execute(VPTransformState &State) { 8772 assert(!State.Instance && "Reduction being replicated."); 8773 for (unsigned Part = 0; Part < State.UF; ++Part) { 8774 RecurKind Kind = RdxDesc->getRecurrenceKind(); 8775 Value *NewVecOp = State.get(getVecOp(), Part); 8776 if (VPValue *Cond = getCondOp()) { 8777 Value *NewCond = State.get(Cond, Part); 8778 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 8779 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 8780 Kind, VecTy->getElementType()); 8781 Constant *IdenVec = 8782 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 8783 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 8784 NewVecOp = Select; 8785 } 8786 Value *NewRed = 8787 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 8788 Value *PrevInChain = State.get(getChainOp(), Part); 8789 Value *NextInChain; 8790 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8791 NextInChain = 8792 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 8793 NewRed, PrevInChain); 8794 } else { 8795 NextInChain = State.Builder.CreateBinOp( 8796 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 8797 PrevInChain); 8798 } 8799 State.set(this, getUnderlyingInstr(), NextInChain, Part); 8800 } 8801 } 8802 8803 void VPReplicateRecipe::execute(VPTransformState &State) { 8804 if (State.Instance) { // Generate a single instance. 8805 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 8806 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, 8807 *State.Instance, IsPredicated, State); 8808 // Insert scalar instance packing it into a vector. 8809 if (AlsoPack && State.VF.isVector()) { 8810 // If we're constructing lane 0, initialize to start from poison. 
      if (State.Instance->Lane == 0) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.ValueMap.setVectorValue(getUnderlyingInstr(),
                                      State.Instance->Part, Poison);
      }
      State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
                                           *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
                                      IsPredicated, State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node:
  // if a vector value for the predicated instruction exists at this point it
  // means the instruction has vector users only, and a phi for the vector
  // value is needed. In this case the recipe of the predicated instruction is
  // marked to also do that packing, thereby "hoisting" the insert-element
  // sequence. Otherwise, a phi node for the scalar value is needed.
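  // Sketch of the two shapes this takes (types and block names invented for
  // illustration):
  //
  //   ; vector users only: merge the vector updated by the packing
  //   ; insertelement with the unmodified vector from the masked-off path
  //   %vphi = phi <4 x i32> [ %vec.prev, %predicating.bb ],
  //                         [ %vec.with.lane, %predicated.bb ]
  //
  //   ; scalar users: merge the scalar result with poison
  //   %sphi = phi i32 [ poison, %predicating.bb ], [ %scalar, %predicated.bb ]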
8876 unsigned Part = State.Instance->Part; 8877 Instruction *PredInst = 8878 cast<Instruction>(getOperand(0)->getUnderlyingValue()); 8879 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 8880 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 8881 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 8882 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 8883 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 8884 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 8885 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 8886 } else { 8887 Type *PredInstType = PredInst->getType(); 8888 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 8889 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB); 8890 Phi->addIncoming(ScalarPredInst, PredicatedBB); 8891 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 8892 } 8893 } 8894 8895 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 8896 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 8897 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 8898 StoredValue ? nullptr : getVPValue(), 8899 getAddr(), StoredValue, getMask()); 8900 } 8901 8902 // Determine how to lower the scalar epilogue, which depends on 1) optimising 8903 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 8904 // predication, and 4) a TTI hook that analyses whether the loop is suitable 8905 // for predication. 8906 static ScalarEpilogueLowering getScalarEpilogueLowering( 8907 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 8908 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 8909 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 8910 LoopVectorizationLegality &LVL) { 8911 // 1) OptSize takes precedence over all other options, i.e. if this is set, 8912 // don't look at hints or options, and don't request a scalar epilogue. 8913 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 8914 // LoopAccessInfo (due to code dependency and not being able to reliably get 8915 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 8916 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 8917 // versioning when the vectorization is forced, unlike hasOptSize. So revert 8918 // back to the old way and vectorize with versioning when forced. See D81345.) 
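  // Two illustrative outcomes of this ladder (hypothetical compilations): a
  // function built with -Os and no loop hints stops at step 1 and yields
  // CM_ScalarEpilogueNotAllowedOptSize; a loop annotated with
  // 'vectorize_predicate(enable)' and no overriding command-line option falls
  // through to step 3 and yields CM_ScalarEpilogueNotNeededUsePredicate.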
8919 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 8920 PGSOQueryType::IRPass) && 8921 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 8922 return CM_ScalarEpilogueNotAllowedOptSize; 8923 8924 // 2) If set, obey the directives 8925 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 8926 switch (PreferPredicateOverEpilogue) { 8927 case PreferPredicateTy::ScalarEpilogue: 8928 return CM_ScalarEpilogueAllowed; 8929 case PreferPredicateTy::PredicateElseScalarEpilogue: 8930 return CM_ScalarEpilogueNotNeededUsePredicate; 8931 case PreferPredicateTy::PredicateOrDontVectorize: 8932 return CM_ScalarEpilogueNotAllowedUsePredicate; 8933 }; 8934 } 8935 8936 // 3) If set, obey the hints 8937 switch (Hints.getPredicate()) { 8938 case LoopVectorizeHints::FK_Enabled: 8939 return CM_ScalarEpilogueNotNeededUsePredicate; 8940 case LoopVectorizeHints::FK_Disabled: 8941 return CM_ScalarEpilogueAllowed; 8942 }; 8943 8944 // 4) if the TTI hook indicates this is profitable, request predication. 8945 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 8946 LVL.getLAI())) 8947 return CM_ScalarEpilogueNotNeededUsePredicate; 8948 8949 return CM_ScalarEpilogueAllowed; 8950 } 8951 8952 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, 8953 unsigned Part) { 8954 set(Def, V, Part); 8955 ILV->setVectorValue(IRDef, Part, V); 8956 } 8957 8958 // Process the loop in the VPlan-native vectorization path. This path builds 8959 // VPlan upfront in the vectorization pipeline, which allows to apply 8960 // VPlan-to-VPlan transformations from the very beginning without modifying the 8961 // input LLVM IR. 8962 static bool processLoopInVPlanNativePath( 8963 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 8964 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 8965 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 8966 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 8967 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 8968 8969 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 8970 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 8971 return false; 8972 } 8973 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 8974 Function *F = L->getHeader()->getParent(); 8975 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 8976 8977 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8978 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 8979 8980 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 8981 &Hints, IAI); 8982 // Use the planner for outer loop vectorization. 8983 // TODO: CM is not used at this point inside the planner. Turn CM into an 8984 // optional argument if we don't need it in the future. 8985 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 8986 8987 // Get user vectorization factor. 8988 ElementCount UserVF = Hints.getWidth(); 8989 8990 // Plan how to best vectorize, return the best VF and its cost. 8991 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 8992 8993 // If we are stress testing VPlan builds, do not attempt to generate vector 8994 // code. Masked vector code generation support will follow soon. 8995 // Also, do not attempt to vectorize if no vector code will be produced. 
8996 if (VPlanBuildStressTest || EnableVPlanPredication || 8997 VectorizationFactor::Disabled() == VF) 8998 return false; 8999 9000 LVP.setBestPlan(VF.Width, 1); 9001 9002 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9003 &CM, BFI, PSI); 9004 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9005 << L->getHeader()->getParent()->getName() << "\"\n"); 9006 LVP.executePlan(LB, DT); 9007 9008 // Mark the loop as already vectorized to avoid vectorizing again. 9009 Hints.setAlreadyVectorized(); 9010 9011 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9012 return true; 9013 } 9014 9015 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9016 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9017 !EnableLoopInterleaving), 9018 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9019 !EnableLoopVectorization) {} 9020 9021 bool LoopVectorizePass::processLoop(Loop *L) { 9022 assert((EnableVPlanNativePath || L->isInnermost()) && 9023 "VPlan-native path is not enabled. Only process inner loops."); 9024 9025 #ifndef NDEBUG 9026 const std::string DebugLocStr = getDebugLocString(L); 9027 #endif /* NDEBUG */ 9028 9029 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9030 << L->getHeader()->getParent()->getName() << "\" from " 9031 << DebugLocStr << "\n"); 9032 9033 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9034 9035 LLVM_DEBUG( 9036 dbgs() << "LV: Loop hints:" 9037 << " force=" 9038 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9039 ? "disabled" 9040 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9041 ? "enabled" 9042 : "?")) 9043 << " width=" << Hints.getWidth() 9044 << " unroll=" << Hints.getInterleave() << "\n"); 9045 9046 // Function containing loop 9047 Function *F = L->getHeader()->getParent(); 9048 9049 // Looking at the diagnostic output is the only way to determine if a loop 9050 // was vectorized (other than looking at the IR or machine code), so it 9051 // is important to generate an optimization remark for each loop. Most of 9052 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9053 // generated as OptimizationRemark and OptimizationRemarkMissed are 9054 // less verbose reporting vectorized loops and unvectorized loops that may 9055 // benefit from vectorization, respectively. 9056 9057 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9058 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9059 return false; 9060 } 9061 9062 PredicatedScalarEvolution PSE(*SE, *L); 9063 9064 // Check if it is legal to vectorize the loop. 9065 LoopVectorizationRequirements Requirements(*ORE); 9066 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9067 &Requirements, &Hints, DB, AC, BFI, PSI); 9068 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9069 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9070 Hints.emitRemarkWithHints(); 9071 return false; 9072 } 9073 9074 // Check the function attributes and profiles to find out if this function 9075 // should be optimized for size. 9076 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9077 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9078 9079 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9080 // here. They may require CFG and instruction level transformations before 9081 // even evaluating whether vectorization is profitable. 
Since we cannot modify 9082 // the incoming IR, we need to build VPlan upfront in the vectorization 9083 // pipeline. 9084 if (!L->isInnermost()) 9085 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9086 ORE, BFI, PSI, Hints); 9087 9088 assert(L->isInnermost() && "Inner loop expected."); 9089 9090 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9091 // count by optimizing for size, to minimize overheads. 9092 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9093 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9094 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9095 << "This loop is worth vectorizing only if no scalar " 9096 << "iteration overheads are incurred."); 9097 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9098 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9099 else { 9100 LLVM_DEBUG(dbgs() << "\n"); 9101 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9102 } 9103 } 9104 9105 // Check the function attributes to see if implicit floats are allowed. 9106 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9107 // an integer loop and the vector instructions selected are purely integer 9108 // vector instructions? 9109 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9110 reportVectorizationFailure( 9111 "Can't vectorize when the NoImplicitFloat attribute is used", 9112 "loop not vectorized due to NoImplicitFloat attribute", 9113 "NoImplicitFloat", ORE, L); 9114 Hints.emitRemarkWithHints(); 9115 return false; 9116 } 9117 9118 // Check if the target supports potentially unsafe FP vectorization. 9119 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9120 // for the target we're vectorizing for, to make sure none of the 9121 // additional fp-math flags can help. 9122 if (Hints.isPotentiallyUnsafe() && 9123 TTI->isFPVectorizationPotentiallyUnsafe()) { 9124 reportVectorizationFailure( 9125 "Potentially unsafe FP op prevents vectorization", 9126 "loop not vectorized due to unsafe FP support.", 9127 "UnsafeFP", ORE, L); 9128 Hints.emitRemarkWithHints(); 9129 return false; 9130 } 9131 9132 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9133 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9134 9135 // If an override option has been passed in for interleaved accesses, use it. 9136 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9137 UseInterleaved = EnableInterleavedMemAccesses; 9138 9139 // Analyze interleaved memory accesses. 9140 if (UseInterleaved) { 9141 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 9142 } 9143 9144 // Use the cost model. 9145 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 9146 F, &Hints, IAI); 9147 CM.collectValuesToIgnore(); 9148 9149 // Use the planner for vectorization. 9150 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 9151 9152 // Get user vectorization factor and interleave count. 9153 ElementCount UserVF = Hints.getWidth(); 9154 unsigned UserIC = Hints.getInterleave(); 9155 9156 // Plan how to best vectorize, return the best VF and its cost. 9157 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 9158 9159 VectorizationFactor VF = VectorizationFactor::Disabled(); 9160 unsigned IC = 1; 9161 9162 if (MaybeVF) { 9163 VF = *MaybeVF; 9164 // Select the interleave count. 
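    // (Rough summary, not an exact contract: the interleave-count heuristic
    // weighs the loop's register pressure against the target's register
    // count, the loop's cost, and its known trip count; small, cheap loops
    // tend to get IC > 1 to expose more ILP, while register-hungry ones stay
    // at 1.)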
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being
    // explicitly requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9225 ORE->emit([&]() { 9226 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 9227 L->getStartLoc(), L->getHeader()) 9228 << VecDiagMsg.second; 9229 }); 9230 ORE->emit([&]() { 9231 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 9232 L->getStartLoc(), L->getHeader()) 9233 << IntDiagMsg.second; 9234 }); 9235 return false; 9236 } else if (!VectorizeLoop && InterleaveLoop) { 9237 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9238 ORE->emit([&]() { 9239 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 9240 L->getStartLoc(), L->getHeader()) 9241 << VecDiagMsg.second; 9242 }); 9243 } else if (VectorizeLoop && !InterleaveLoop) { 9244 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9245 << ") in " << DebugLocStr << '\n'); 9246 ORE->emit([&]() { 9247 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 9248 L->getStartLoc(), L->getHeader()) 9249 << IntDiagMsg.second; 9250 }); 9251 } else if (VectorizeLoop && InterleaveLoop) { 9252 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9253 << ") in " << DebugLocStr << '\n'); 9254 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9255 } 9256 9257 LVP.setBestPlan(VF.Width, IC); 9258 9259 using namespace ore; 9260 bool DisableRuntimeUnroll = false; 9261 MDNode *OrigLoopID = L->getLoopID(); 9262 9263 if (!VectorizeLoop) { 9264 assert(IC > 1 && "interleave count should not be 1 or 0"); 9265 // If we decided that it is not legal to vectorize the loop, then 9266 // interleave it. 9267 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, 9268 BFI, PSI); 9269 LVP.executePlan(Unroller, DT); 9270 9271 ORE->emit([&]() { 9272 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 9273 L->getHeader()) 9274 << "interleaved loop (interleaved count: " 9275 << NV("InterleaveCount", IC) << ")"; 9276 }); 9277 } else { 9278 // If we decided that it is *legal* to vectorize the loop, then do it. 9279 9280 // Consider vectorizing the epilogue too if it's profitable. 9281 VectorizationFactor EpilogueVF = 9282 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 9283 if (EpilogueVF.Width.isVector()) { 9284 9285 // The first pass vectorizes the main loop and creates a scalar epilogue 9286 // to be vectorized by executing the plan (potentially with a different 9287 // factor) again shortly afterwards. 9288 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 9289 EpilogueVF.Width.getKnownMinValue(), 1); 9290 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, 9291 &LVL, &CM, BFI, PSI); 9292 9293 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 9294 LVP.executePlan(MainILV, DT); 9295 ++LoopsVectorized; 9296 9297 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9298 formLCSSARecursively(*L, *DT, LI, SE); 9299 9300 // Second pass vectorizes the epilogue and adjusts the control flow 9301 // edges from the first pass. 
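      // Worked example with assumed numbers: if the main loop was vectorized
      // with VF=8 and IC=2 and the epilogue cost model picked VF=4, the first
      // pass above leaves a scalar remainder of fewer than 16 iterations; this
      // second pass re-vectorizes that remainder 4 at a time, so at most 3
      // iterations remain for the final scalar epilogue.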
9302 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); 9303 EPI.MainLoopVF = EPI.EpilogueVF; 9304 EPI.MainLoopUF = EPI.EpilogueUF; 9305 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 9306 ORE, EPI, &LVL, &CM, BFI, PSI); 9307 LVP.executePlan(EpilogILV, DT); 9308 ++LoopsEpilogueVectorized; 9309 9310 if (!MainILV.areSafetyChecksAdded()) 9311 DisableRuntimeUnroll = true; 9312 } else { 9313 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 9314 &LVL, &CM, BFI, PSI); 9315 LVP.executePlan(LB, DT); 9316 ++LoopsVectorized; 9317 9318 // Add metadata to disable runtime unrolling a scalar loop when there are 9319 // no runtime checks about strides and memory. A scalar loop that is 9320 // rarely used is not worth unrolling. 9321 if (!LB.areSafetyChecksAdded()) 9322 DisableRuntimeUnroll = true; 9323 } 9324 9325 // Report the vectorization decision. 9326 ORE->emit([&]() { 9327 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 9328 L->getHeader()) 9329 << "vectorized loop (vectorization width: " 9330 << NV("VectorizationFactor", VF.Width) 9331 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 9332 }); 9333 } 9334 9335 Optional<MDNode *> RemainderLoopID = 9336 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 9337 LLVMLoopVectorizeFollowupEpilogue}); 9338 if (RemainderLoopID.hasValue()) { 9339 L->setLoopID(RemainderLoopID.getValue()); 9340 } else { 9341 if (DisableRuntimeUnroll) 9342 AddRuntimeUnrollDisableMetaData(L); 9343 9344 // Mark the loop as already vectorized to avoid vectorizing again. 9345 Hints.setAlreadyVectorized(); 9346 } 9347 9348 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9349 return true; 9350 } 9351 9352 LoopVectorizeResult LoopVectorizePass::runImpl( 9353 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 9354 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 9355 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 9356 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 9357 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 9358 SE = &SE_; 9359 LI = &LI_; 9360 TTI = &TTI_; 9361 DT = &DT_; 9362 BFI = &BFI_; 9363 TLI = TLI_; 9364 AA = &AA_; 9365 AC = &AC_; 9366 GetLAA = &GetLAA_; 9367 DB = &DB_; 9368 ORE = &ORE_; 9369 PSI = PSI_; 9370 9371 // Don't attempt if 9372 // 1. the target claims to have no vector registers, and 9373 // 2. interleaving won't help ILP. 9374 // 9375 // The second condition is necessary because, even if the target has no 9376 // vector registers, loop vectorization may still enable scalar 9377 // interleaving. 9378 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 9379 TTI->getMaxInterleaveFactor(1) < 2) 9380 return LoopVectorizeResult(false, false); 9381 9382 bool Changed = false, CFGChanged = false; 9383 9384 // The vectorizer requires loops to be in simplified form. 9385 // Since simplification may add new inner loops, it has to run before the 9386 // legality and profitability checks. This means running the loop vectorizer 9387 // will simplify all loops, regardless of whether anything end up being 9388 // vectorized. 9389 for (auto &L : *LI) 9390 Changed |= CFGChanged |= 9391 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9392 9393 // Build up a worklist of inner-loops to vectorize. This is necessary as 9394 // the act of vectorizing or partially unrolling a loop creates new loops 9395 // and can invalidate iterators across the loops. 
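  // (Descriptive note: collectSupportedLoops adds innermost loops, plus outer
  // loops with explicit vectorization hints when the VPlan-native path is
  // enabled; vectorizing an entry never appends new work, so the simple
  // pop-until-empty walk below is sufficient.)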
9396 SmallVector<Loop *, 8> Worklist; 9397 9398 for (Loop *L : *LI) 9399 collectSupportedLoops(*L, LI, ORE, Worklist); 9400 9401 LoopsAnalyzed += Worklist.size(); 9402 9403 // Now walk the identified inner loops. 9404 while (!Worklist.empty()) { 9405 Loop *L = Worklist.pop_back_val(); 9406 9407 // For the inner loops we actually process, form LCSSA to simplify the 9408 // transform. 9409 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 9410 9411 Changed |= CFGChanged |= processLoop(L); 9412 } 9413 9414 // Process each loop nest in the function. 9415 return LoopVectorizeResult(Changed, CFGChanged); 9416 } 9417 9418 PreservedAnalyses LoopVectorizePass::run(Function &F, 9419 FunctionAnalysisManager &AM) { 9420 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 9421 auto &LI = AM.getResult<LoopAnalysis>(F); 9422 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 9423 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 9424 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 9425 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 9426 auto &AA = AM.getResult<AAManager>(F); 9427 auto &AC = AM.getResult<AssumptionAnalysis>(F); 9428 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 9429 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 9430 MemorySSA *MSSA = EnableMSSALoopDependency 9431 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 9432 : nullptr; 9433 9434 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 9435 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 9436 [&](Loop &L) -> const LoopAccessInfo & { 9437 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 9438 TLI, TTI, nullptr, MSSA}; 9439 return LAM.getResult<LoopAccessAnalysis>(L, AR); 9440 }; 9441 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 9442 ProfileSummaryInfo *PSI = 9443 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 9444 LoopVectorizeResult Result = 9445 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 9446 if (!Result.MadeAnyChange) 9447 return PreservedAnalyses::all(); 9448 PreservedAnalyses PA; 9449 9450 // We currently do not preserve loopinfo/dominator analyses with outer loop 9451 // vectorization. Until this is addressed, mark these analyses as preserved 9452 // only for non-VPlan-native path. 9453 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 9454 if (!EnableVPlanNativePath) { 9455 PA.preserve<LoopAnalysis>(); 9456 PA.preserve<DominatorTreeAnalysis>(); 9457 } 9458 PA.preserve<BasicAA>(); 9459 PA.preserve<GlobalsAA>(); 9460 if (!Result.MadeCFGChange) 9461 PA.preserveSet<CFGAnalyses>(); 9462 return PA; 9463 } 9464