//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
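//
// As a simple illustration of the widening described above: a loop such as
//   for (i = 0; i < n; ++i) A[i] = B[i] + 1;
// is rewritten so that each vector iteration processes VF consecutive
// elements of A and B, and the induction variable advances by VF (times the
// unroll factor) instead of by one.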
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return TypeSize::get(VF.getKnownMinValue() *
                             DL.getTypeAllocSize(Ty).getFixedValue(),
                         VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
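  ///
  /// For instance, with UF = 2 and VF = 4 a scalarized definition has eight
  /// scalar instances (two parts of four lanes each); requesting the vector
  /// value for part 1 packs the four lane values of that part into a single
  /// vector with an insertelement sequence and caches the result in
  /// VectorLoopValueMap for subsequent uses.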
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
    VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
  }

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
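///
/// For example, a loop whose main vector body is generated with VF = 16 and
/// UF = 2 and whose residual iterations are then vectorized with VF = 8 and
/// UF = 1 carries both factor pairs here, together with the check blocks and
/// trip-count values shared between the two stages.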
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                                 LoopInfo *LI, DominatorTree *DT,
                                 const TargetLibraryInfo *TLI,
                                 const TargetTransformInfo *TTI,
                                 AssumptionCache *AC,
                                 OptimizationRemarkEmitter *ORE,
                                 EpilogueLoopVectorizationInfo &EPI,
                                 LoopVectorizationLegality *LVL,
                                 llvm::LoopVectorizationCostModel *CM,
                                 BlockFrequencyInfo *BFI,
                                 ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
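/// If the instruction itself carries no debug location, the first operand
/// that is an instruction with a location is returned instead; failing that,
/// the original instruction is returned unchanged.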
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
    const StringRef OREMsg, const StringRef ORETag,
    OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Hints for the loop vectorization cost model describing how the scalar
// epilogue loop should be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedups/slowdowns due to the supported instruction set. We use
/// the TargetTransformInfo to query the different backends for the cost of
/// different operations.
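///
/// For example, the model may find that a candidate factor such as VF = 4 is
/// cheaper per scalar iteration than the scalar loop and select it, or it may
/// fall back to a factor of one when no vector factor is expected to pay off.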
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Set up cost-based decisions for the user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on cost. This function makes
  /// cost-based decisions for Load/Store instructions and collects them in a
  /// map. This decision map is used for building the lists of loop-uniform
  /// and loop-scalar instructions. The calculated cost is saved with the
  /// widening decision in order to avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
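  /// For example, querying the factors {1, 2, 4} is expected to produce one
  /// RegisterUsage entry per factor, each mapping a target register class ID
  /// to the number of loop-invariant values and to the maximum number of
  /// values that are live at the same time.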
1290 SmallVector<RegisterUsage, 8> 1291 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1292 1293 /// Collect values we want to ignore in the cost model. 1294 void collectValuesToIgnore(); 1295 1296 /// Split reductions into those that happen in the loop, and those that happen 1297 /// outside. In loop reductions are collected into InLoopReductionChains. 1298 void collectInLoopReductions(); 1299 1300 /// \returns The smallest bitwidth each instruction can be represented with. 1301 /// The vector equivalents of these instructions should be truncated to this 1302 /// type. 1303 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1304 return MinBWs; 1305 } 1306 1307 /// \returns True if it is more profitable to scalarize instruction \p I for 1308 /// vectorization factor \p VF. 1309 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1310 assert(VF.isVector() && 1311 "Profitable to scalarize relevant only for VF > 1."); 1312 1313 // Cost model is not run in the VPlan-native path - return conservative 1314 // result until this changes. 1315 if (EnableVPlanNativePath) 1316 return false; 1317 1318 auto Scalars = InstsToScalarize.find(VF); 1319 assert(Scalars != InstsToScalarize.end() && 1320 "VF not yet analyzed for scalarization profitability"); 1321 return Scalars->second.find(I) != Scalars->second.end(); 1322 } 1323 1324 /// Returns true if \p I is known to be uniform after vectorization. 1325 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1326 if (VF.isScalar()) 1327 return true; 1328 1329 // Cost model is not run in the VPlan-native path - return conservative 1330 // result until this changes. 1331 if (EnableVPlanNativePath) 1332 return false; 1333 1334 auto UniformsPerVF = Uniforms.find(VF); 1335 assert(UniformsPerVF != Uniforms.end() && 1336 "VF not yet analyzed for uniformity"); 1337 return UniformsPerVF->second.count(I); 1338 } 1339 1340 /// Returns true if \p I is known to be scalar after vectorization. 1341 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1342 if (VF.isScalar()) 1343 return true; 1344 1345 // Cost model is not run in the VPlan-native path - return conservative 1346 // result until this changes. 1347 if (EnableVPlanNativePath) 1348 return false; 1349 1350 auto ScalarsPerVF = Scalars.find(VF); 1351 assert(ScalarsPerVF != Scalars.end() && 1352 "Scalar values are not calculated for VF"); 1353 return ScalarsPerVF->second.count(I); 1354 } 1355 1356 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1357 /// for vectorization factor \p VF. 1358 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1359 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1360 !isProfitableToScalarize(I, VF) && 1361 !isScalarAfterVectorization(I, VF); 1362 } 1363 1364 /// Decision that was taken during cost calculation for memory instruction. 1365 enum InstWidening { 1366 CM_Unknown, 1367 CM_Widen, // For consecutive accesses with stride +1. 1368 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1369 CM_Interleave, 1370 CM_GatherScatter, 1371 CM_Scalarize 1372 }; 1373 1374 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1375 /// instruction \p I and vector width \p VF. 
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W, unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if \p I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue() const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an in-loop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an in-loop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
                             bool &NeedToScalarize);

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, a power-of-2 larger
  /// than zero. One is returned if vectorization should best be avoided due
  /// to cost.
  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
                                    ElementCount UserVF);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(ElementCount VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
1745 void collectLoopUniforms(ElementCount VF); 1746 1747 /// Collect the instructions that are scalar after vectorization. An 1748 /// instruction is scalar if it is known to be uniform or will be scalarized 1749 /// during vectorization. Non-uniform scalarized instructions will be 1750 /// represented by VF values in the vectorized loop, each corresponding to an 1751 /// iteration of the original scalar loop. 1752 void collectLoopScalars(ElementCount VF); 1753 1754 /// Keeps cost model vectorization decision and cost for instructions. 1755 /// Right now it is used for memory instructions only. 1756 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1757 std::pair<InstWidening, unsigned>>; 1758 1759 DecisionList WideningDecisions; 1760 1761 /// Returns true if \p V is expected to be vectorized and it needs to be 1762 /// extracted. 1763 bool needsExtract(Value *V, ElementCount VF) const { 1764 Instruction *I = dyn_cast<Instruction>(V); 1765 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1766 TheLoop->isLoopInvariant(I)) 1767 return false; 1768 1769 // Assume we can vectorize V (and hence we need extraction) if the 1770 // scalars are not computed yet. This can happen, because it is called 1771 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1772 // the scalars are collected. That should be a safe assumption in most 1773 // cases, because we check if the operands have vectorizable types 1774 // beforehand in LoopVectorizationLegality. 1775 return Scalars.find(VF) == Scalars.end() || 1776 !isScalarAfterVectorization(I, VF); 1777 }; 1778 1779 /// Returns a range containing only operands needing to be extracted. 1780 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1781 ElementCount VF) { 1782 return SmallVector<Value *, 4>(make_filter_range( 1783 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1784 } 1785 1786 /// Determines if we have the infrastructure to vectorize loop \p L and its 1787 /// epilogue, assuming the main loop is vectorized by \p VF. 1788 bool isCandidateForEpilogueVectorization(const Loop &L, 1789 const ElementCount VF) const; 1790 1791 /// Returns true if epilogue vectorization is considered profitable, and 1792 /// false otherwise. 1793 /// \p VF is the vectorization factor chosen for the original loop. 1794 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1795 1796 public: 1797 /// The loop that we evaluate. 1798 Loop *TheLoop; 1799 1800 /// Predicated scalar evolution analysis. 1801 PredicatedScalarEvolution &PSE; 1802 1803 /// Loop Info analysis. 1804 LoopInfo *LI; 1805 1806 /// Vectorization legality. 1807 LoopVectorizationLegality *Legal; 1808 1809 /// Vector target information. 1810 const TargetTransformInfo &TTI; 1811 1812 /// Target Library Info. 1813 const TargetLibraryInfo *TLI; 1814 1815 /// Demanded bits analysis. 1816 DemandedBits *DB; 1817 1818 /// Assumption cache. 1819 AssumptionCache *AC; 1820 1821 /// Interface to emit optimization remarks. 1822 OptimizationRemarkEmitter *ORE; 1823 1824 const Function *TheFunction; 1825 1826 /// Loop Vectorize Hint. 1827 const LoopVectorizeHints *Hints; 1828 1829 /// The interleave access information contains groups of interleaved accesses 1830 /// with the same stride and close to each other. 1831 InterleavedAccessInfo &InterleaveInfo; 1832 1833 /// Values to ignore in the cost model. 1834 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1835 1836 /// Values to ignore in the cost model when VF > 1. 
1837 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1838 1839 /// Profitable vector factors. 1840 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1841 }; 1842 1843 } // end namespace llvm 1844 1845 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1846 // vectorization. The loop needs to be annotated with #pragma omp simd 1847 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1848 // vector length information is not provided, vectorization is not considered 1849 // explicit. Interleave hints are not allowed either. These limitations will be 1850 // relaxed in the future. 1851 // Please, note that we are currently forced to abuse the pragma 'clang 1852 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1853 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1854 // provides *explicit vectorization hints* (LV can bypass legal checks and 1855 // assume that vectorization is legal). However, both hints are implemented 1856 // using the same metadata (llvm.loop.vectorize, processed by 1857 // LoopVectorizeHints). This will be fixed in the future when the native IR 1858 // representation for pragma 'omp simd' is introduced. 1859 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1860 OptimizationRemarkEmitter *ORE) { 1861 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 1862 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1863 1864 // Only outer loops with an explicit vectorization hint are supported. 1865 // Unannotated outer loops are ignored. 1866 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1867 return false; 1868 1869 Function *Fn = OuterLp->getHeader()->getParent(); 1870 if (!Hints.allowVectorization(Fn, OuterLp, 1871 true /*VectorizeOnlyWhenForced*/)) { 1872 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1873 return false; 1874 } 1875 1876 if (Hints.getInterleave() > 1) { 1877 // TODO: Interleave support is future work. 1878 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1879 "outer loops.\n"); 1880 Hints.emitRemarkWithHints(); 1881 return false; 1882 } 1883 1884 return true; 1885 } 1886 1887 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1888 OptimizationRemarkEmitter *ORE, 1889 SmallVectorImpl<Loop *> &V) { 1890 // Collect inner loops and outer loops without irreducible control flow. For 1891 // now, only collect outer loops that have explicit vectorization hints. If we 1892 // are stress testing the VPlan H-CFG construction, we collect the outermost 1893 // loop of every loop nest. 1894 if (L.isInnermost() || VPlanBuildStressTest || 1895 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1896 LoopBlocksRPO RPOT(&L); 1897 RPOT.perform(LI); 1898 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1899 V.push_back(&L); 1900 // TODO: Collect inner loops inside marked outer loops in case 1901 // vectorization fails for the outer loop. Do not invoke 1902 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1903 // already known to be reducible. We can use an inherited attribute for 1904 // that. 1905 return; 1906 } 1907 } 1908 for (Loop *InnerL : L) 1909 collectSupportedLoops(*InnerL, LI, ORE, V); 1910 } 1911 1912 namespace { 1913 1914 /// The LoopVectorize Pass. 
1915 struct LoopVectorize : public FunctionPass { 1916 /// Pass identification, replacement for typeid 1917 static char ID; 1918 1919 LoopVectorizePass Impl; 1920 1921 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1922 bool VectorizeOnlyWhenForced = false) 1923 : FunctionPass(ID), 1924 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1925 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1926 } 1927 1928 bool runOnFunction(Function &F) override { 1929 if (skipFunction(F)) 1930 return false; 1931 1932 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1933 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1934 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1935 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1936 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1937 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1938 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1939 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1940 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1941 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1942 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1943 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1944 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1945 1946 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1947 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1948 1949 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1950 GetLAA, *ORE, PSI).MadeAnyChange; 1951 } 1952 1953 void getAnalysisUsage(AnalysisUsage &AU) const override { 1954 AU.addRequired<AssumptionCacheTracker>(); 1955 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1956 AU.addRequired<DominatorTreeWrapperPass>(); 1957 AU.addRequired<LoopInfoWrapperPass>(); 1958 AU.addRequired<ScalarEvolutionWrapperPass>(); 1959 AU.addRequired<TargetTransformInfoWrapperPass>(); 1960 AU.addRequired<AAResultsWrapperPass>(); 1961 AU.addRequired<LoopAccessLegacyAnalysis>(); 1962 AU.addRequired<DemandedBitsWrapperPass>(); 1963 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1964 AU.addRequired<InjectTLIMappingsLegacy>(); 1965 1966 // We currently do not preserve loopinfo/dominator analyses with outer loop 1967 // vectorization. Until this is addressed, mark these analyses as preserved 1968 // only for non-VPlan-native path. 1969 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1970 if (!EnableVPlanNativePath) { 1971 AU.addPreserved<LoopInfoWrapperPass>(); 1972 AU.addPreserved<DominatorTreeWrapperPass>(); 1973 } 1974 1975 AU.addPreserved<BasicAAWrapperPass>(); 1976 AU.addPreserved<GlobalsAAWrapperPass>(); 1977 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1978 } 1979 }; 1980 1981 } // end anonymous namespace 1982 1983 //===----------------------------------------------------------------------===// 1984 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1985 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1986 //===----------------------------------------------------------------------===// 1987 1988 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1989 // We need to place the broadcast of invariant variables outside the loop, 1990 // but only if it's proven safe to do so. Else, broadcast will be inside 1991 // vector loop body. 
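  // E.g. (illustrative, VF = 4, i32): CreateVectorSplat below emits roughly
  //   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> undef, <4 x i32> zeroinitializer
  // either in the preheader (if hoisting is safe) or in the vector loop body.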
1992 Instruction *Instr = dyn_cast<Instruction>(V); 1993 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1994 (!Instr || 1995 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1996 // Place the code for broadcasting invariant variables in the new preheader. 1997 IRBuilder<>::InsertPointGuard Guard(Builder); 1998 if (SafeToHoist) 1999 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2000 2001 // Broadcast the scalar into all locations in the vector. 2002 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2003 2004 return Shuf; 2005 } 2006 2007 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2008 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 2009 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2010 "Expected either an induction phi-node or a truncate of it!"); 2011 Value *Start = II.getStartValue(); 2012 2013 // Construct the initial value of the vector IV in the vector loop preheader 2014 auto CurrIP = Builder.saveIP(); 2015 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2016 if (isa<TruncInst>(EntryVal)) { 2017 assert(Start->getType()->isIntegerTy() && 2018 "Truncation requires an integer type"); 2019 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2020 Step = Builder.CreateTrunc(Step, TruncType); 2021 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2022 } 2023 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2024 Value *SteppedStart = 2025 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2026 2027 // We create vector phi nodes for both integer and floating-point induction 2028 // variables. Here, we determine the kind of arithmetic we will perform. 2029 Instruction::BinaryOps AddOp; 2030 Instruction::BinaryOps MulOp; 2031 if (Step->getType()->isIntegerTy()) { 2032 AddOp = Instruction::Add; 2033 MulOp = Instruction::Mul; 2034 } else { 2035 AddOp = II.getInductionOpcode(); 2036 MulOp = Instruction::FMul; 2037 } 2038 2039 // Multiply the vectorization factor by the step using integer or 2040 // floating-point arithmetic as appropriate. 2041 Value *ConstVF = 2042 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2043 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 2044 2045 // Create a vector splat to use in the induction update. 2046 // 2047 // FIXME: If the step is non-constant, we create the vector splat with 2048 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2049 // handle a constant vector splat. 2050 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2051 Value *SplatVF = isa<Constant>(Mul) 2052 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2053 : Builder.CreateVectorSplat(VF, Mul); 2054 Builder.restoreIP(CurrIP); 2055 2056 // We may need to add the step a number of times, depending on the unroll 2057 // factor. The last of those goes into the PHI. 
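  // E.g. (illustrative, i32 IV starting at 0 with step 1, VF = 4, UF = 2) this
  // builds:
  //   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
  //                                 [ %vec.ind.next, %vector.latch ]
  //   %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  //   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>
  // where %vec.ind feeds unroll part 0, %step.add feeds part 1, and
  // %vec.ind.next is moved to the latch below.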
2058 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2059 &*LoopVectorBody->getFirstInsertionPt()); 2060 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2061 Instruction *LastInduction = VecInd; 2062 for (unsigned Part = 0; Part < UF; ++Part) { 2063 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 2064 2065 if (isa<TruncInst>(EntryVal)) 2066 addMetadata(LastInduction, EntryVal); 2067 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 2068 2069 LastInduction = cast<Instruction>(addFastMathFlag( 2070 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 2071 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2072 } 2073 2074 // Move the last step to the end of the latch block. This ensures consistent 2075 // placement of all induction updates. 2076 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2077 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2078 auto *ICmp = cast<Instruction>(Br->getCondition()); 2079 LastInduction->moveBefore(ICmp); 2080 LastInduction->setName("vec.ind.next"); 2081 2082 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2083 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2084 } 2085 2086 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2087 return Cost->isScalarAfterVectorization(I, VF) || 2088 Cost->isProfitableToScalarize(I, VF); 2089 } 2090 2091 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2092 if (shouldScalarizeInstruction(IV)) 2093 return true; 2094 auto isScalarInst = [&](User *U) -> bool { 2095 auto *I = cast<Instruction>(U); 2096 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2097 }; 2098 return llvm::any_of(IV->users(), isScalarInst); 2099 } 2100 2101 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2102 const InductionDescriptor &ID, const Instruction *EntryVal, 2103 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 2104 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2105 "Expected either an induction phi-node or a truncate of it!"); 2106 2107 // This induction variable is not the phi from the original loop but the 2108 // newly-created IV based on the proof that casted Phi is equal to the 2109 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2110 // re-uses the same InductionDescriptor that original IV uses but we don't 2111 // have to do any recording in this case - that is done when original IV is 2112 // processed. 2113 if (isa<TruncInst>(EntryVal)) 2114 return; 2115 2116 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2117 if (Casts.empty()) 2118 return; 2119 // Only the first Cast instruction in the Casts vector is of interest. 2120 // The rest of the Casts (if exist) have no uses outside the 2121 // induction update chain itself. 
2122 Instruction *CastInst = *Casts.begin(); 2123 if (Lane < UINT_MAX) 2124 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 2125 else 2126 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 2127 } 2128 2129 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 2130 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2131 "Primary induction variable must have an integer type"); 2132 2133 auto II = Legal->getInductionVars().find(IV); 2134 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2135 2136 auto ID = II->second; 2137 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2138 2139 // The value from the original loop to which we are mapping the new induction 2140 // variable. 2141 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2142 2143 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2144 2145 // Generate code for the induction step. Note that induction steps are 2146 // required to be loop-invariant 2147 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2148 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2149 "Induction step should be loop invariant"); 2150 if (PSE.getSE()->isSCEVable(IV->getType())) { 2151 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2152 return Exp.expandCodeFor(Step, Step->getType(), 2153 LoopVectorPreHeader->getTerminator()); 2154 } 2155 return cast<SCEVUnknown>(Step)->getValue(); 2156 }; 2157 2158 // The scalar value to broadcast. This is derived from the canonical 2159 // induction variable. If a truncation type is given, truncate the canonical 2160 // induction variable and step. Otherwise, derive these values from the 2161 // induction descriptor. 2162 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2163 Value *ScalarIV = Induction; 2164 if (IV != OldInduction) { 2165 ScalarIV = IV->getType()->isIntegerTy() 2166 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2167 : Builder.CreateCast(Instruction::SIToFP, Induction, 2168 IV->getType()); 2169 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2170 ScalarIV->setName("offset.idx"); 2171 } 2172 if (Trunc) { 2173 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2174 assert(Step->getType()->isIntegerTy() && 2175 "Truncation requires an integer step"); 2176 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2177 Step = Builder.CreateTrunc(Step, TruncType); 2178 } 2179 return ScalarIV; 2180 }; 2181 2182 // Create the vector values from the scalar IV, in the absence of creating a 2183 // vector IV. 2184 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2185 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2186 for (unsigned Part = 0; Part < UF; ++Part) { 2187 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2188 Value *EntryPart = 2189 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2190 ID.getInductionOpcode()); 2191 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 2192 if (Trunc) 2193 addMetadata(EntryPart, Trunc); 2194 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 2195 } 2196 }; 2197 2198 // Now do the actual transformations, and start with creating the step value. 
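  // E.g. (illustrative, i64 IV with step 1, VF = 4): if the IV is widened we
  // get a <4 x i64> vector phi as in createVectorIntOrFpInductionPHI above;
  // if all of its users are scalarized, buildScalarSteps below instead emits
  // the per-lane scalars %iv + 0, %iv + 1, %iv + 2, %iv + 3.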
2199 Value *Step = CreateStepValue(ID.getStep()); 2200 if (VF.isZero() || VF.isScalar()) { 2201 Value *ScalarIV = CreateScalarIV(Step); 2202 CreateSplatIV(ScalarIV, Step); 2203 return; 2204 } 2205 2206 // Determine if we want a scalar version of the induction variable. This is 2207 // true if the induction variable itself is not widened, or if it has at 2208 // least one user in the loop that is not widened. 2209 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2210 if (!NeedsScalarIV) { 2211 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2212 return; 2213 } 2214 2215 // Try to create a new independent vector induction variable. If we can't 2216 // create the phi node, we will splat the scalar induction variable in each 2217 // loop iteration. 2218 if (!shouldScalarizeInstruction(EntryVal)) { 2219 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2220 Value *ScalarIV = CreateScalarIV(Step); 2221 // Create scalar steps that can be used by instructions we will later 2222 // scalarize. Note that the addition of the scalar steps will not increase 2223 // the number of instructions in the loop in the common case prior to 2224 // InstCombine. We will be trading one vector extract for each scalar step. 2225 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2226 return; 2227 } 2228 2229 // All IV users are scalar instructions, so only emit a scalar IV, not a 2230 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2231 // predicate used by the masked loads/stores. 2232 Value *ScalarIV = CreateScalarIV(Step); 2233 if (!Cost->isScalarEpilogueAllowed()) 2234 CreateSplatIV(ScalarIV, Step); 2235 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2236 } 2237 2238 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2239 Instruction::BinaryOps BinOp) { 2240 // Create and check the types. 2241 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2242 int VLen = ValVTy->getNumElements(); 2243 2244 Type *STy = Val->getType()->getScalarType(); 2245 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2246 "Induction Step must be an integer or FP"); 2247 assert(Step->getType() == STy && "Step has wrong type"); 2248 2249 SmallVector<Constant *, 8> Indices; 2250 2251 if (STy->isIntegerTy()) { 2252 // Create a vector of consecutive numbers from zero to VF. 2253 for (int i = 0; i < VLen; ++i) 2254 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2255 2256 // Add the consecutive indices to the vector value. 2257 Constant *Cv = ConstantVector::get(Indices); 2258 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2259 Step = Builder.CreateVectorSplat(VLen, Step); 2260 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2261 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2262 // which can be found from the original scalar operations. 2263 Step = Builder.CreateMul(Cv, Step); 2264 return Builder.CreateAdd(Val, Step, "induction"); 2265 } 2266 2267 // Floating point induction. 2268 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2269 "Binary Opcode should be specified for FP induction"); 2270 // Create a vector of consecutive numbers from zero to VF. 2271 for (int i = 0; i < VLen; ++i) 2272 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2273 2274 // Add the consecutive indices to the vector value. 
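  // E.g. (illustrative, float IV with step %s, VF = 4, StartIdx = 0): the
  // code below computes <0.0, 1.0, 2.0, 3.0> * %s with fmul and applies the
  // result to the splatted start value with the recorded fadd/fsub opcode.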
2275 Constant *Cv = ConstantVector::get(Indices); 2276 2277 Step = Builder.CreateVectorSplat(VLen, Step); 2278 2279 // Floating point operations had to be 'fast' to enable the induction. 2280 FastMathFlags Flags; 2281 Flags.setFast(); 2282 2283 Value *MulOp = Builder.CreateFMul(Cv, Step); 2284 if (isa<Instruction>(MulOp)) 2285 // Have to check, MulOp may be a constant 2286 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2287 2288 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2289 if (isa<Instruction>(BOp)) 2290 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2291 return BOp; 2292 } 2293 2294 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2295 Instruction *EntryVal, 2296 const InductionDescriptor &ID) { 2297 // We shouldn't have to build scalar steps if we aren't vectorizing. 2298 assert(VF.isVector() && "VF should be greater than one"); 2299 // Get the value type and ensure it and the step have the same integer type. 2300 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2301 assert(ScalarIVTy == Step->getType() && 2302 "Val and Step should have the same type"); 2303 2304 // We build scalar steps for both integer and floating-point induction 2305 // variables. Here, we determine the kind of arithmetic we will perform. 2306 Instruction::BinaryOps AddOp; 2307 Instruction::BinaryOps MulOp; 2308 if (ScalarIVTy->isIntegerTy()) { 2309 AddOp = Instruction::Add; 2310 MulOp = Instruction::Mul; 2311 } else { 2312 AddOp = ID.getInductionOpcode(); 2313 MulOp = Instruction::FMul; 2314 } 2315 2316 // Determine the number of scalars we need to generate for each unroll 2317 // iteration. If EntryVal is uniform, we only need to generate the first 2318 // lane. Otherwise, we generate all VF values. 2319 unsigned Lanes = 2320 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2321 ? 1 2322 : VF.getKnownMinValue(); 2323 assert((!VF.isScalable() || Lanes == 1) && 2324 "Should never scalarize a scalable vector"); 2325 // Compute the scalar steps and save the results in VectorLoopValueMap. 2326 for (unsigned Part = 0; Part < UF; ++Part) { 2327 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2328 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2329 ScalarIVTy->getScalarSizeInBits()); 2330 Value *StartIdx = 2331 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2332 if (ScalarIVTy->isFloatingPointTy()) 2333 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2334 StartIdx = addFastMathFlag(Builder.CreateBinOp( 2335 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); 2336 // The step returned by `createStepForVF` is a runtime-evaluated value 2337 // when VF is scalable. Otherwise, it should be folded into a Constant. 
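      // E.g. (illustrative, fixed VF = 4, Part = 1, Lane = 2): StartIdx is
      // folded to 4 * 1 + 2 = 6, so the lane's value below becomes
      // %scalar.iv + 6 * %step.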
2338 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2339 "Expected StartIdx to be folded to a constant when VF is not " 2340 "scalable"); 2341 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2342 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2343 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2344 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2345 } 2346 } 2347 } 2348 2349 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2350 assert(V != Induction && "The new induction variable should not be used."); 2351 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2352 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2353 2354 // If we have a stride that is replaced by one, do it here. Defer this for 2355 // the VPlan-native path until we start running Legal checks in that path. 2356 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2357 V = ConstantInt::get(V->getType(), 1); 2358 2359 // If we have a vector mapped to this value, return it. 2360 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2361 return VectorLoopValueMap.getVectorValue(V, Part); 2362 2363 // If the value has not been vectorized, check if it has been scalarized 2364 // instead. If it has been scalarized, and we actually need the value in 2365 // vector form, we will construct the vector values on demand. 2366 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2367 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2368 2369 // If we've scalarized a value, that value should be an instruction. 2370 auto *I = cast<Instruction>(V); 2371 2372 // If we aren't vectorizing, we can just copy the scalar map values over to 2373 // the vector map. 2374 if (VF.isScalar()) { 2375 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2376 return ScalarValue; 2377 } 2378 2379 // Get the last scalar instruction we generated for V and Part. If the value 2380 // is known to be uniform after vectorization, this corresponds to lane zero 2381 // of the Part unroll iteration. Otherwise, the last instruction is the one 2382 // we created for the last vector lane of the Part unroll iteration. 2383 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2384 ? 0 2385 : VF.getKnownMinValue() - 1; 2386 assert((!VF.isScalable() || LastLane == 0) && 2387 "Scalable vectorization can't lead to any scalarized values."); 2388 auto *LastInst = cast<Instruction>( 2389 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2390 2391 // Set the insert point after the last scalarized instruction. This ensures 2392 // the insertelement sequence will directly follow the scalar definitions. 2393 auto OldIP = Builder.saveIP(); 2394 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2395 Builder.SetInsertPoint(&*NewIP); 2396 2397 // However, if we are vectorizing, we need to construct the vector values. 2398 // If the value is known to be uniform after vectorization, we can just 2399 // broadcast the scalar value corresponding to lane zero for each unroll 2400 // iteration. Otherwise, we construct the vector values using insertelement 2401 // instructions. Since the resulting vectors are stored in 2402 // VectorLoopValueMap, we will only generate the insertelements once. 
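    // E.g. (illustrative, VF = 4, i32): the non-uniform case below packs the
    // four scalar lane values into a vector with a chain of insertelements:
    //   %p0 = insertelement <4 x i32> undef, i32 %s0, i32 0
    //   %p1 = insertelement <4 x i32> %p0,   i32 %s1, i32 1
    //   ... and so on for lanes 2 and 3.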
2403 Value *VectorValue = nullptr; 2404 if (Cost->isUniformAfterVectorization(I, VF)) { 2405 VectorValue = getBroadcastInstrs(ScalarValue); 2406 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2407 } else { 2408 // Initialize packing with insertelements to start from undef. 2409 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2410 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2411 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2412 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2413 packScalarIntoVectorValue(V, {Part, Lane}); 2414 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2415 } 2416 Builder.restoreIP(OldIP); 2417 return VectorValue; 2418 } 2419 2420 // If this scalar is unknown, assume that it is a constant or that it is 2421 // loop invariant. Broadcast V and save the value for future uses. 2422 Value *B = getBroadcastInstrs(V); 2423 VectorLoopValueMap.setVectorValue(V, Part, B); 2424 return B; 2425 } 2426 2427 Value * 2428 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2429 const VPIteration &Instance) { 2430 // If the value is not an instruction contained in the loop, it should 2431 // already be scalar. 2432 if (OrigLoop->isLoopInvariant(V)) 2433 return V; 2434 2435 assert(Instance.Lane > 0 2436 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2437 : true && "Uniform values only have lane zero"); 2438 2439 // If the value from the original loop has not been vectorized, it is 2440 // represented by UF x VF scalar values in the new loop. Return the requested 2441 // scalar value. 2442 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2443 return VectorLoopValueMap.getScalarValue(V, Instance); 2444 2445 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2446 // for the given unroll part. If this entry is not a vector type (i.e., the 2447 // vectorization factor is one), there is no need to generate an 2448 // extractelement instruction. 2449 auto *U = getOrCreateVectorValue(V, Instance.Part); 2450 if (!U->getType()->isVectorTy()) { 2451 assert(VF.isScalar() && "Value not scalarized has non-vector type"); 2452 return U; 2453 } 2454 2455 // Otherwise, the value from the original loop has been vectorized and is 2456 // represented by UF vector values. Extract and return the requested scalar 2457 // value from the appropriate vector lane. 
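  // E.g. (illustrative): for VF = 4 and Instance.Lane == 2 this emits
  //   %s = extractelement <4 x i32> %vec.part, i32 2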
2458 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2459 } 2460 2461 void InnerLoopVectorizer::packScalarIntoVectorValue( 2462 Value *V, const VPIteration &Instance) { 2463 assert(V != Induction && "The new induction variable should not be used."); 2464 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2465 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2466 2467 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2468 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2469 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2470 Builder.getInt32(Instance.Lane)); 2471 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2472 } 2473 2474 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2475 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2476 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2477 SmallVector<int, 8> ShuffleMask; 2478 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2479 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2480 2481 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); 2482 } 2483 2484 // Return whether we allow using masked interleave-groups (for dealing with 2485 // strided loads/stores that reside in predicated blocks, or for dealing 2486 // with gaps). 2487 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2488 // If an override option has been passed in for interleaved accesses, use it. 2489 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2490 return EnableMaskedInterleavedMemAccesses; 2491 2492 return TTI.enableMaskedInterleavedAccessVectorization(); 2493 } 2494 2495 // Try to vectorize the interleave group that \p Instr belongs to. 2496 // 2497 // E.g. Translate following interleaved load group (factor = 3): 2498 // for (i = 0; i < N; i+=3) { 2499 // R = Pic[i]; // Member of index 0 2500 // G = Pic[i+1]; // Member of index 1 2501 // B = Pic[i+2]; // Member of index 2 2502 // ... // do something to R, G, B 2503 // } 2504 // To: 2505 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2506 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2507 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2508 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2509 // 2510 // Or translate following interleaved store group (factor = 3): 2511 // for (i = 0; i < N; i+=3) { 2512 // ... do something to R, G, B 2513 // Pic[i] = R; // Member of index 0 2514 // Pic[i+1] = G; // Member of index 1 2515 // Pic[i+2] = B; // Member of index 2 2516 // } 2517 // To: 2518 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2519 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2520 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2521 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2522 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2523 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2524 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2525 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2526 VPValue *BlockInMask) { 2527 Instruction *Instr = Group->getInsertPos(); 2528 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2529 2530 // Prepare for the vector type of the interleaved load/store. 
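  // E.g. (illustrative): for an interleave group of factor 3 over i32 with
  // VF = 4, ScalarTy is i32 and VecTy below is <12 x i32>.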
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  assert(!VF.isScalable() &&
         "scalable vector reverse operation is not implemented");
  if (Group->isReverse())
    Index += (VF.getKnownMinValue() - 1) * Group->getFactor();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, {Part, 0});
    setDebugLocFromInst(Builder, AddrPart);

    // Note that the current instruction may be a member of any index in the
    // group; the address needs to be adjusted to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *UndefVec = UndefValue::get(VecTy);

  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    assert(!VF.isScalable() && "scalable vectors not yet supported.");
    MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
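    // E.g. (illustrative, factor 3, VF = 4): a block mask <m0, m1, m2, m3> is
    // replicated below to
    //   <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>
    // so that all members of one scalar iteration share that iteration's mask.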
2596 SmallVector<Value *, 2> NewLoads; 2597 for (unsigned Part = 0; Part < UF; Part++) { 2598 Instruction *NewLoad; 2599 if (BlockInMask || MaskForGaps) { 2600 assert(useMaskedInterleavedAccesses(*TTI) && 2601 "masked interleaved groups are not allowed."); 2602 Value *GroupMask = MaskForGaps; 2603 if (BlockInMask) { 2604 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2605 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2606 Value *ShuffledMask = Builder.CreateShuffleVector( 2607 BlockInMaskPart, 2608 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2609 "interleaved.mask"); 2610 GroupMask = MaskForGaps 2611 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2612 MaskForGaps) 2613 : ShuffledMask; 2614 } 2615 NewLoad = 2616 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2617 GroupMask, UndefVec, "wide.masked.vec"); 2618 } 2619 else 2620 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2621 Group->getAlign(), "wide.vec"); 2622 Group->addMetadata(NewLoad); 2623 NewLoads.push_back(NewLoad); 2624 } 2625 2626 // For each member in the group, shuffle out the appropriate data from the 2627 // wide loads. 2628 unsigned J = 0; 2629 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2630 Instruction *Member = Group->getMember(I); 2631 2632 // Skip the gaps in the group. 2633 if (!Member) 2634 continue; 2635 2636 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2637 auto StrideMask = 2638 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2639 for (unsigned Part = 0; Part < UF; Part++) { 2640 Value *StridedVec = Builder.CreateShuffleVector( 2641 NewLoads[Part], StrideMask, "strided.vec"); 2642 2643 // If this member has different type, cast the result type. 2644 if (Member->getType() != ScalarTy) { 2645 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2646 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2647 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2648 } 2649 2650 if (Group->isReverse()) 2651 StridedVec = reverseVector(StridedVec); 2652 2653 State.set(VPDefs[J], Member, StridedVec, Part); 2654 } 2655 ++J; 2656 } 2657 return; 2658 } 2659 2660 // The sub vector type for current instruction. 2661 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2662 auto *SubVT = VectorType::get(ScalarTy, VF); 2663 2664 // Vectorize the interleaved store group. 2665 for (unsigned Part = 0; Part < UF; Part++) { 2666 // Collect the stored vector from each member. 2667 SmallVector<Value *, 4> StoredVecs; 2668 for (unsigned i = 0; i < InterleaveFactor; i++) { 2669 // Interleaved store group doesn't allow a gap, so each index has a member 2670 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2671 2672 Value *StoredVec = State.get(StoredValues[i], Part); 2673 2674 if (Group->isReverse()) 2675 StoredVec = reverseVector(StoredVec); 2676 2677 // If this member has different type, cast it to a unified type. 2678 2679 if (StoredVec->getType() != SubVT) 2680 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2681 2682 StoredVecs.push_back(StoredVec); 2683 } 2684 2685 // Concatenate all vectors into a wide vector. 2686 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2687 2688 // Interleave the elements in the wide vector. 
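// For example (assuming VF = 4, InterleaveFactor = 3): createInterleaveMask(4, 3)
// used just below yields <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>, turning the
// concatenation [R0..R3 | G0..G3 | B0..B3] into R0,G0,B0,R1,G1,B1,... exactly
// as in the store example at the top of this function.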
2689 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2690 Value *IVec = Builder.CreateShuffleVector( 2691 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2692 "interleaved.vec"); 2693 2694 Instruction *NewStoreInstr; 2695 if (BlockInMask) { 2696 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2697 Value *ShuffledMask = Builder.CreateShuffleVector( 2698 BlockInMaskPart, 2699 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2700 "interleaved.mask"); 2701 NewStoreInstr = Builder.CreateMaskedStore( 2702 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2703 } 2704 else 2705 NewStoreInstr = 2706 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2707 2708 Group->addMetadata(NewStoreInstr); 2709 } 2710 } 2711 2712 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2713 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2714 VPValue *StoredValue, VPValue *BlockInMask) { 2715 // Attempt to issue a wide load. 2716 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2717 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2718 2719 assert((LI || SI) && "Invalid Load/Store instruction"); 2720 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2721 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2722 2723 LoopVectorizationCostModel::InstWidening Decision = 2724 Cost->getWideningDecision(Instr, VF); 2725 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2726 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2727 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2728 "CM decision is not to widen the memory instruction"); 2729 2730 Type *ScalarDataTy = getMemInstValueType(Instr); 2731 2732 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2733 const Align Alignment = getLoadStoreAlignment(Instr); 2734 2735 // Determine if the pointer operand of the access is either consecutive or 2736 // reverse consecutive. 2737 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2738 bool ConsecutiveStride = 2739 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2740 bool CreateGatherScatter = 2741 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2742 2743 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2744 // gather/scatter. Otherwise Decision should have been to Scalarize. 2745 assert((ConsecutiveStride || CreateGatherScatter) && 2746 "The instruction should be scalarized"); 2747 (void)ConsecutiveStride; 2748 2749 VectorParts BlockInMaskParts(UF); 2750 bool isMaskRequired = BlockInMask; 2751 if (isMaskRequired) 2752 for (unsigned Part = 0; Part < UF; ++Part) 2753 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2754 2755 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2756 // Calculate the pointer for the specific unroll-part. 2757 GetElementPtrInst *PartPtr = nullptr; 2758 2759 bool InBounds = false; 2760 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2761 InBounds = gep->isInBounds(); 2762 2763 if (Reverse) { 2764 assert(!VF.isScalable() && 2765 "Reversing vectors is not yet supported for scalable vectors."); 2766 2767 // If the address is consecutive but reversed, then the 2768 // wide store needs to start at the last vector element. 
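// Worked example (assuming a fixed VF = 4): for unroll part Part, the two GEPs
// below compute Ptr + (-Part * 4) + (1 - 4) = Ptr - Part * 4 - 3, so each wide
// access covers the four elements ending at the current (reverse) position;
// the data vector and any mask are reversed separately via reverseVector.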
2769 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2770 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2771 PartPtr->setIsInBounds(InBounds); 2772 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2773 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2774 PartPtr->setIsInBounds(InBounds); 2775 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2776 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2777 } else { 2778 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2779 PartPtr = cast<GetElementPtrInst>( 2780 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2781 PartPtr->setIsInBounds(InBounds); 2782 } 2783 2784 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2785 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2786 }; 2787 2788 // Handle Stores: 2789 if (SI) { 2790 setDebugLocFromInst(Builder, SI); 2791 2792 for (unsigned Part = 0; Part < UF; ++Part) { 2793 Instruction *NewSI = nullptr; 2794 Value *StoredVal = State.get(StoredValue, Part); 2795 if (CreateGatherScatter) { 2796 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2797 Value *VectorGep = State.get(Addr, Part); 2798 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2799 MaskPart); 2800 } else { 2801 if (Reverse) { 2802 // If we store to reverse consecutive memory locations, then we need 2803 // to reverse the order of elements in the stored value. 2804 StoredVal = reverseVector(StoredVal); 2805 // We don't want to update the value in the map as it might be used in 2806 // another expression. So don't call resetVectorValue(StoredVal). 2807 } 2808 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2809 if (isMaskRequired) 2810 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2811 BlockInMaskParts[Part]); 2812 else 2813 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2814 } 2815 addMetadata(NewSI, SI); 2816 } 2817 return; 2818 } 2819 2820 // Handle loads. 2821 assert(LI && "Must have a load instruction"); 2822 setDebugLocFromInst(Builder, LI); 2823 for (unsigned Part = 0; Part < UF; ++Part) { 2824 Value *NewLI; 2825 if (CreateGatherScatter) { 2826 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2827 Value *VectorGep = State.get(Addr, Part); 2828 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2829 nullptr, "wide.masked.gather"); 2830 addMetadata(NewLI, LI); 2831 } else { 2832 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2833 if (isMaskRequired) 2834 NewLI = Builder.CreateMaskedLoad( 2835 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2836 "wide.masked.load"); 2837 else 2838 NewLI = 2839 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2840 2841 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2842 addMetadata(NewLI, LI); 2843 if (Reverse) 2844 NewLI = reverseVector(NewLI); 2845 } 2846 2847 State.set(Def, Instr, NewLI, Part); 2848 } 2849 } 2850 2851 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2852 const VPIteration &Instance, 2853 bool IfPredicateInstr, 2854 VPTransformState &State) { 2855 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2856 2857 setDebugLocFromInst(Builder, Instr); 2858 2859 // Does this instruction return a value ? 
2860 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2861
2862 Instruction *Cloned = Instr->clone();
2863 if (!IsVoidRetTy)
2864 Cloned->setName(Instr->getName() + ".cloned");
2865
2866 // Replace the operands of the cloned instruction with their scalar
2867 // equivalents in the new loop.
2868 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2869 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2870 auto InputInstance = Instance;
2871 if (!Operand || !OrigLoop->contains(Operand) ||
2872 (Cost->isUniformAfterVectorization(Operand, State.VF)))
2873 InputInstance.Lane = 0;
2874 auto *NewOp = State.get(User.getOperand(op), InputInstance);
2875 Cloned->setOperand(op, NewOp);
2876 }
2877 addNewMetadata(Cloned, Instr);
2878
2879 // Place the cloned scalar in the new loop.
2880 Builder.Insert(Cloned);
2881
2882 // TODO: Set result for VPValue of VPReplicateRecipe. This requires
2883 // representing scalar values in VPTransformState. Add the cloned scalar to
2884 // the scalar map entry.
2885 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2886
2887 // If we just cloned a new assumption, add it to the assumption cache.
2888 if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2889 if (II->getIntrinsicID() == Intrinsic::assume)
2890 AC->registerAssumption(II);
2891
2892 // End if-block.
2893 if (IfPredicateInstr)
2894 PredicatedInstructions.push_back(Cloned);
2895 }
2896
2897 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2898 Value *End, Value *Step,
2899 Instruction *DL) {
2900 BasicBlock *Header = L->getHeader();
2901 BasicBlock *Latch = L->getLoopLatch();
2902 // As we're just creating this loop, it's possible no latch exists
2903 // yet. If so, use the header as this will be a single block loop.
2904 if (!Latch)
2905 Latch = Header;
2906
2907 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2908 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2909 setDebugLocFromInst(Builder, OldInst);
2910 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2911
2912 Builder.SetInsertPoint(Latch->getTerminator());
2913 setDebugLocFromInst(Builder, OldInst);
2914
2915 // Create i+1 and fill the PHINode.
2916 Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2917 Induction->addIncoming(Start, L->getLoopPreheader());
2918 Induction->addIncoming(Next, Latch);
2919 // Create the compare.
2920 Value *ICmp = Builder.CreateICmpEQ(Next, End);
2921 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
2922
2923 // Now we have two terminators. Remove the old one from the block.
2924 Latch->getTerminator()->eraseFromParent();
2925
2926 return Induction;
2927 }
2928
2929 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2930 if (TripCount)
2931 return TripCount;
2932
2933 assert(L && "Create Trip Count for null loop.");
2934 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2935 // Find the loop boundaries.
2936 ScalarEvolution *SE = PSE.getSE();
2937 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2938 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2939 "Invalid loop count");
2940
2941 Type *IdxTy = Legal->getWidestInductionType();
2942 assert(IdxTy && "No type for induction");
2943
2944 // The exit count might have the type of i64 while the phi is i32. This can
2945 // happen if we have an induction variable that is sign extended before the
2946 // compare.
The only way that we get a backedge taken count is that the 2947 // induction variable was signed and as such will not overflow. In such a case 2948 // truncation is legal. 2949 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2950 IdxTy->getPrimitiveSizeInBits()) 2951 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2952 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2953 2954 // Get the total trip count from the count by adding 1. 2955 const SCEV *ExitCount = SE->getAddExpr( 2956 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2957 2958 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2959 2960 // Expand the trip count and place the new instructions in the preheader. 2961 // Notice that the pre-header does not change, only the loop body. 2962 SCEVExpander Exp(*SE, DL, "induction"); 2963 2964 // Count holds the overall loop count (N). 2965 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2966 L->getLoopPreheader()->getTerminator()); 2967 2968 if (TripCount->getType()->isPointerTy()) 2969 TripCount = 2970 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2971 L->getLoopPreheader()->getTerminator()); 2972 2973 return TripCount; 2974 } 2975 2976 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2977 if (VectorTripCount) 2978 return VectorTripCount; 2979 2980 Value *TC = getOrCreateTripCount(L); 2981 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2982 2983 Type *Ty = TC->getType(); 2984 // This is where we can make the step a runtime constant. 2985 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 2986 2987 // If the tail is to be folded by masking, round the number of iterations N 2988 // up to a multiple of Step instead of rounding down. This is done by first 2989 // adding Step-1 and then rounding down. Note that it's ok if this addition 2990 // overflows: the vector induction variable will eventually wrap to zero given 2991 // that it starts at zero and its Step is a power of two; the loop will then 2992 // exit, with the last early-exit vector comparison also producing all-true. 2993 if (Cost->foldTailByMasking()) { 2994 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2995 "VF*UF must be a power of 2 when folding tail by masking"); 2996 assert(!VF.isScalable() && 2997 "Tail folding not yet supported for scalable vectors"); 2998 TC = Builder.CreateAdd( 2999 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3000 } 3001 3002 // Now we need to generate the expression for the part of the loop that the 3003 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3004 // iterations are not required for correctness, or N - Step, otherwise. Step 3005 // is equal to the vectorization factor (number of SIMD elements) times the 3006 // unroll factor (number of SIMD instructions). 3007 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3008 3009 // There are two cases where we need to ensure (at least) the last iteration 3010 // runs in the scalar remainder loop. Thus, if the step evenly divides 3011 // the trip count, we set the remainder to be equal to the step. If the step 3012 // does not evenly divide the trip count, no adjustment is necessary since 3013 // there will already be scalar iterations. Note that the minimum iterations 3014 // check ensures that N >= Step. 
The cases are: 3015 // 1) If there is a non-reversed interleaved group that may speculatively 3016 // access memory out-of-bounds. 3017 // 2) If any instruction may follow a conditionally taken exit. That is, if 3018 // the loop contains multiple exiting blocks, or a single exiting block 3019 // which is not the latch. 3020 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3021 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3022 R = Builder.CreateSelect(IsZero, Step, R); 3023 } 3024 3025 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3026 3027 return VectorTripCount; 3028 } 3029 3030 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3031 const DataLayout &DL) { 3032 // Verify that V is a vector type with same number of elements as DstVTy. 3033 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3034 unsigned VF = DstFVTy->getNumElements(); 3035 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3036 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3037 Type *SrcElemTy = SrcVecTy->getElementType(); 3038 Type *DstElemTy = DstFVTy->getElementType(); 3039 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3040 "Vector elements must have same size"); 3041 3042 // Do a direct cast if element types are castable. 3043 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3044 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3045 } 3046 // V cannot be directly casted to desired vector type. 3047 // May happen when V is a floating point vector but DstVTy is a vector of 3048 // pointers or vice-versa. Handle this using a two-step bitcast using an 3049 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3050 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3051 "Only one type should be a pointer type"); 3052 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3053 "Only one type should be a floating point type"); 3054 Type *IntTy = 3055 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3056 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3057 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3058 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3059 } 3060 3061 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3062 BasicBlock *Bypass) { 3063 Value *Count = getOrCreateTripCount(L); 3064 // Reuse existing vector loop preheader for TC checks. 3065 // Note that new preheader block is generated for vector loop. 3066 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3067 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3068 3069 // Generate code to check if the loop's trip count is less than VF * UF, or 3070 // equal to it in case a scalar epilogue is required; this implies that the 3071 // vector trip count is zero. This check also covers the case where adding one 3072 // to the backedge-taken count overflowed leading to an incorrect trip count 3073 // of zero. In this case we will also jump to the scalar loop. 3074 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3075 : ICmpInst::ICMP_ULT; 3076 3077 // If tail is to be folded, vector loop takes care of all iterations. 
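// A sketch of the IR emitted below when the tail is not folded, assuming a
// fixed VF = 4, UF = 2 (names and types are illustrative):
//   %min.iters.check = icmp ult i64 %count, 8   ; ule if a scalar epilogue
//                                               ; is required
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// i.e. the vector loop is bypassed when fewer than VF * UF iterations are
// available (or exactly VF * UF, if an epilogue iteration must be kept).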
3078 Value *CheckMinIters = Builder.getFalse(); 3079 if (!Cost->foldTailByMasking()) { 3080 Value *Step = 3081 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3082 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3083 } 3084 // Create new preheader for vector loop. 3085 LoopVectorPreHeader = 3086 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3087 "vector.ph"); 3088 3089 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3090 DT->getNode(Bypass)->getIDom()) && 3091 "TC check is expected to dominate Bypass"); 3092 3093 // Update dominator for Bypass & LoopExit. 3094 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3095 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3096 3097 ReplaceInstWithInst( 3098 TCCheckBlock->getTerminator(), 3099 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3100 LoopBypassBlocks.push_back(TCCheckBlock); 3101 } 3102 3103 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3104 // Reuse existing vector loop preheader for SCEV checks. 3105 // Note that new preheader block is generated for vector loop. 3106 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 3107 3108 // Generate the code to check that the SCEV assumptions that we made. 3109 // We want the new basic block to start at the first instruction in a 3110 // sequence of instructions that form a check. 3111 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 3112 "scev.check"); 3113 Value *SCEVCheck = Exp.expandCodeForPredicate( 3114 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 3115 3116 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 3117 if (C->isZero()) 3118 return; 3119 3120 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3121 (OptForSizeBasedOnProfile && 3122 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3123 "Cannot SCEV check stride or overflow when optimizing for size"); 3124 3125 SCEVCheckBlock->setName("vector.scevcheck"); 3126 // Create new preheader for vector loop. 3127 LoopVectorPreHeader = 3128 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 3129 nullptr, "vector.ph"); 3130 3131 // Update dominator only if this is first RT check. 3132 if (LoopBypassBlocks.empty()) { 3133 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3134 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3135 } 3136 3137 ReplaceInstWithInst( 3138 SCEVCheckBlock->getTerminator(), 3139 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 3140 LoopBypassBlocks.push_back(SCEVCheckBlock); 3141 AddedSafetyChecks = true; 3142 } 3143 3144 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 3145 // VPlan-native path does not do any analysis for runtime checks currently. 3146 if (EnableVPlanNativePath) 3147 return; 3148 3149 // Reuse existing vector loop preheader for runtime memory checks. 3150 // Note that new preheader block is generated for vector loop. 3151 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 3152 3153 // Generate the code that checks in runtime if arrays overlap. We put the 3154 // checks into a separate block to make the more common case of few elements 3155 // faster. 
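// Conceptually, the checks produced by addRuntimeChecks() below compare the
// bounds of the accessed memory regions (e.g. that [A, A+len) and [B, B+len)
// do not overlap), and the resulting condition sends execution to the scalar
// loop when a conflict cannot be ruled out. (Simplified description, not the
// literal emitted IR.)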
3156 auto *LAI = Legal->getLAI(); 3157 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 3158 if (!RtPtrChecking.Need) 3159 return; 3160 3161 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3162 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3163 "Cannot emit memory checks when optimizing for size, unless forced " 3164 "to vectorize."); 3165 ORE->emit([&]() { 3166 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3167 L->getStartLoc(), L->getHeader()) 3168 << "Code-size may be reduced by not forcing " 3169 "vectorization, or by source-code modifications " 3170 "eliminating the need for runtime checks " 3171 "(e.g., adding 'restrict')."; 3172 }); 3173 } 3174 3175 MemCheckBlock->setName("vector.memcheck"); 3176 // Create new preheader for vector loop. 3177 LoopVectorPreHeader = 3178 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 3179 "vector.ph"); 3180 3181 auto *CondBranch = cast<BranchInst>( 3182 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 3183 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 3184 LoopBypassBlocks.push_back(MemCheckBlock); 3185 AddedSafetyChecks = true; 3186 3187 // Update dominator only if this is first RT check. 3188 if (LoopBypassBlocks.empty()) { 3189 DT->changeImmediateDominator(Bypass, MemCheckBlock); 3190 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 3191 } 3192 3193 Instruction *FirstCheckInst; 3194 Instruction *MemRuntimeCheck; 3195 std::tie(FirstCheckInst, MemRuntimeCheck) = 3196 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 3197 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 3198 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 3199 "claimed checks are required"); 3200 CondBranch->setCondition(MemRuntimeCheck); 3201 3202 // We currently don't use LoopVersioning for the actual loop cloning but we 3203 // still use it to add the noalias metadata. 3204 LVer = std::make_unique<LoopVersioning>( 3205 *Legal->getLAI(), 3206 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3207 DT, PSE.getSE()); 3208 LVer->prepareNoAliasMetadata(); 3209 } 3210 3211 Value *InnerLoopVectorizer::emitTransformedIndex( 3212 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3213 const InductionDescriptor &ID) const { 3214 3215 SCEVExpander Exp(*SE, DL, "induction"); 3216 auto Step = ID.getStep(); 3217 auto StartValue = ID.getStartValue(); 3218 assert(Index->getType() == Step->getType() && 3219 "Index type does not match StepValue type"); 3220 3221 // Note: the IR at this point is broken. We cannot use SE to create any new 3222 // SCEV and then expand it, hoping that SCEV's simplification will give us 3223 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3224 // lead to various SCEV crashes. So all we can do is to use builder and rely 3225 // on InstCombine for future simplifications. Here we handle some trivial 3226 // cases only. 
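// For example, the two helpers below fold trivial identities instead of
// emitting IR: CreateAdd(0, X) and CreateMul(1, X) both simply return X, so
// the common "start + index * step" expansion stays free of dead arithmetic
// when the step is 1 or the start is 0.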
3227 auto CreateAdd = [&B](Value *X, Value *Y) { 3228 assert(X->getType() == Y->getType() && "Types don't match!"); 3229 if (auto *CX = dyn_cast<ConstantInt>(X)) 3230 if (CX->isZero()) 3231 return Y; 3232 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3233 if (CY->isZero()) 3234 return X; 3235 return B.CreateAdd(X, Y); 3236 }; 3237 3238 auto CreateMul = [&B](Value *X, Value *Y) { 3239 assert(X->getType() == Y->getType() && "Types don't match!"); 3240 if (auto *CX = dyn_cast<ConstantInt>(X)) 3241 if (CX->isOne()) 3242 return Y; 3243 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3244 if (CY->isOne()) 3245 return X; 3246 return B.CreateMul(X, Y); 3247 }; 3248 3249 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3250 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3251 // the DomTree is not kept up-to-date for additional blocks generated in the 3252 // vector loop. By using the header as insertion point, we guarantee that the 3253 // expanded instructions dominate all their uses. 3254 auto GetInsertPoint = [this, &B]() { 3255 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3256 if (InsertBB != LoopVectorBody && 3257 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3258 return LoopVectorBody->getTerminator(); 3259 return &*B.GetInsertPoint(); 3260 }; 3261 switch (ID.getKind()) { 3262 case InductionDescriptor::IK_IntInduction: { 3263 assert(Index->getType() == StartValue->getType() && 3264 "Index type does not match StartValue type"); 3265 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3266 return B.CreateSub(StartValue, Index); 3267 auto *Offset = CreateMul( 3268 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3269 return CreateAdd(StartValue, Offset); 3270 } 3271 case InductionDescriptor::IK_PtrInduction: { 3272 assert(isa<SCEVConstant>(Step) && 3273 "Expected constant step for pointer induction"); 3274 return B.CreateGEP( 3275 StartValue->getType()->getPointerElementType(), StartValue, 3276 CreateMul(Index, 3277 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3278 } 3279 case InductionDescriptor::IK_FpInduction: { 3280 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3281 auto InductionBinOp = ID.getInductionBinOp(); 3282 assert(InductionBinOp && 3283 (InductionBinOp->getOpcode() == Instruction::FAdd || 3284 InductionBinOp->getOpcode() == Instruction::FSub) && 3285 "Original bin op should be defined for FP induction"); 3286 3287 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3288 3289 // Floating point operations had to be 'fast' to enable the induction. 3290 FastMathFlags Flags; 3291 Flags.setFast(); 3292 3293 Value *MulExp = B.CreateFMul(StepValue, Index); 3294 if (isa<Instruction>(MulExp)) 3295 // We have to check, the MulExp may be a constant. 
3296 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3297 3298 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3299 "induction"); 3300 if (isa<Instruction>(BOp)) 3301 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3302 3303 return BOp; 3304 } 3305 case InductionDescriptor::IK_NoInduction: 3306 return nullptr; 3307 } 3308 llvm_unreachable("invalid enum"); 3309 } 3310 3311 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3312 LoopScalarBody = OrigLoop->getHeader(); 3313 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3314 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3315 assert(LoopExitBlock && "Must have an exit block"); 3316 assert(LoopVectorPreHeader && "Invalid loop structure"); 3317 3318 LoopMiddleBlock = 3319 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3320 LI, nullptr, Twine(Prefix) + "middle.block"); 3321 LoopScalarPreHeader = 3322 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3323 nullptr, Twine(Prefix) + "scalar.ph"); 3324 3325 // Set up branch from middle block to the exit and scalar preheader blocks. 3326 // completeLoopSkeleton will update the condition to use an iteration check, 3327 // if required to decide whether to execute the remainder. 3328 BranchInst *BrInst = 3329 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3330 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3331 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3332 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3333 3334 // We intentionally don't let SplitBlock to update LoopInfo since 3335 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3336 // LoopVectorBody is explicitly added to the correct place few lines later. 3337 LoopVectorBody = 3338 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3339 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3340 3341 // Update dominator for loop exit. 3342 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3343 3344 // Create and register the new vector loop. 3345 Loop *Lp = LI->AllocateLoop(); 3346 Loop *ParentLoop = OrigLoop->getParentLoop(); 3347 3348 // Insert the new loop into the loop nest and register the new basic blocks 3349 // before calling any utilities such as SCEV that require valid LoopInfo. 3350 if (ParentLoop) { 3351 ParentLoop->addChildLoop(Lp); 3352 } else { 3353 LI->addTopLevelLoop(Lp); 3354 } 3355 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3356 return Lp; 3357 } 3358 3359 void InnerLoopVectorizer::createInductionResumeValues( 3360 Loop *L, Value *VectorTripCount, 3361 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3362 assert(VectorTripCount && L && "Expected valid arguments"); 3363 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3364 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3365 "Inconsistent information about additional bypass."); 3366 // We are going to resume the execution of the scalar loop. 3367 // Go over all of the induction variables that we found and fix the 3368 // PHIs that are left in the scalar version of the loop. 3369 // The starting values of PHI nodes depend on the counter of the last 3370 // iteration in the vectorized loop. 3371 // If we come from a bypass edge then we need to start from the original 3372 // start value. 
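// Illustrative example (values assumed): for the primary induction (start 0,
// step 1) the resume value is just the vector trip count; for an induction
// such as j = base + 3 * i, the loop below computes ind.end = base + 3 * n.vec
// via emitTransformedIndex, and bc.resume.val picks between ind.end (arriving
// from the middle block) and the original start value (arriving from a bypass
// block).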
3373 for (auto &InductionEntry : Legal->getInductionVars()) { 3374 PHINode *OrigPhi = InductionEntry.first; 3375 InductionDescriptor II = InductionEntry.second; 3376 3377 // Create phi nodes to merge from the backedge-taken check block. 3378 PHINode *BCResumeVal = 3379 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3380 LoopScalarPreHeader->getTerminator()); 3381 // Copy original phi DL over to the new one. 3382 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3383 Value *&EndValue = IVEndValues[OrigPhi]; 3384 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3385 if (OrigPhi == OldInduction) { 3386 // We know what the end value is. 3387 EndValue = VectorTripCount; 3388 } else { 3389 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3390 Type *StepType = II.getStep()->getType(); 3391 Instruction::CastOps CastOp = 3392 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3393 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3394 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3395 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3396 EndValue->setName("ind.end"); 3397 3398 // Compute the end value for the additional bypass (if applicable). 3399 if (AdditionalBypass.first) { 3400 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3401 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3402 StepType, true); 3403 CRD = 3404 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3405 EndValueFromAdditionalBypass = 3406 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3407 EndValueFromAdditionalBypass->setName("ind.end"); 3408 } 3409 } 3410 // The new PHI merges the original incoming value, in case of a bypass, 3411 // or the value at the end of the vectorized loop. 3412 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3413 3414 // Fix the scalar body counter (PHI node). 3415 // The old induction's phi node in the scalar body needs the truncated 3416 // value. 3417 for (BasicBlock *BB : LoopBypassBlocks) 3418 BCResumeVal->addIncoming(II.getStartValue(), BB); 3419 3420 if (AdditionalBypass.first) 3421 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3422 EndValueFromAdditionalBypass); 3423 3424 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3425 } 3426 } 3427 3428 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3429 MDNode *OrigLoopID) { 3430 assert(L && "Expected valid loop."); 3431 3432 // The trip counts should be cached by now. 3433 Value *Count = getOrCreateTripCount(L); 3434 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3435 3436 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3437 3438 // Add a check in the middle block to see if we have completed 3439 // all of the iterations in the first vector loop. 3440 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3441 // If tail is to be folded, we know we don't need to run the remainder. 3442 if (!Cost->foldTailByMasking()) { 3443 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3444 Count, VectorTripCount, "cmp.n", 3445 LoopMiddleBlock->getTerminator()); 3446 3447 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3448 // of the corresponding compare because they may have ended up with 3449 // different line numbers and we want to avoid awkward line stepping while 3450 // debugging. Eg. if the compare has got a line number inside the loop. 
3451 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3452 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3453 } 3454 3455 // Get ready to start creating new instructions into the vectorized body. 3456 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3457 "Inconsistent vector loop preheader"); 3458 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3459 3460 Optional<MDNode *> VectorizedLoopID = 3461 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3462 LLVMLoopVectorizeFollowupVectorized}); 3463 if (VectorizedLoopID.hasValue()) { 3464 L->setLoopID(VectorizedLoopID.getValue()); 3465 3466 // Do not setAlreadyVectorized if loop attributes have been defined 3467 // explicitly. 3468 return LoopVectorPreHeader; 3469 } 3470 3471 // Keep all loop hints from the original loop on the vector loop (we'll 3472 // replace the vectorizer-specific hints below). 3473 if (MDNode *LID = OrigLoop->getLoopID()) 3474 L->setLoopID(LID); 3475 3476 LoopVectorizeHints Hints(L, true, *ORE); 3477 Hints.setAlreadyVectorized(); 3478 3479 #ifdef EXPENSIVE_CHECKS 3480 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3481 LI->verify(*DT); 3482 #endif 3483 3484 return LoopVectorPreHeader; 3485 } 3486 3487 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3488 /* 3489 In this function we generate a new loop. The new loop will contain 3490 the vectorized instructions while the old loop will continue to run the 3491 scalar remainder. 3492 3493 [ ] <-- loop iteration number check. 3494 / | 3495 / v 3496 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3497 | / | 3498 | / v 3499 || [ ] <-- vector pre header. 3500 |/ | 3501 | v 3502 | [ ] \ 3503 | [ ]_| <-- vector loop. 3504 | | 3505 | v 3506 | -[ ] <--- middle-block. 3507 | / | 3508 | / v 3509 -|- >[ ] <--- new preheader. 3510 | | 3511 | v 3512 | [ ] \ 3513 | [ ]_| <-- old scalar loop to handle remainder. 3514 \ | 3515 \ v 3516 >[ ] <-- exit block. 3517 ... 3518 */ 3519 3520 // Get the metadata of the original loop before it gets modified. 3521 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3522 3523 // Create an empty vector loop, and prepare basic blocks for the runtime 3524 // checks. 3525 Loop *Lp = createVectorLoopSkeleton(""); 3526 3527 // Now, compare the new count to zero. If it is zero skip the vector loop and 3528 // jump to the scalar loop. This check also covers the case where the 3529 // backedge-taken count is uint##_max: adding one to it will overflow leading 3530 // to an incorrect trip count of zero. In this (rare) case we will also jump 3531 // to the scalar loop. 3532 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3533 3534 // Generate the code to check any assumptions that we've made for SCEV 3535 // expressions. 3536 emitSCEVChecks(Lp, LoopScalarPreHeader); 3537 3538 // Generate the code that checks in runtime if arrays overlap. We put the 3539 // checks into a separate block to make the more common case of few elements 3540 // faster. 3541 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3542 3543 // Some loops have a single integer induction variable, while other loops 3544 // don't. One example is c++ iterators that often have multiple pointer 3545 // induction variables. In the code below we also support a case where we 3546 // don't have a single induction variable. 3547 // 3548 // We try to obtain an induction variable from the original loop as hard 3549 // as possible. 
However if we don't find one that: 3550 // - is an integer 3551 // - counts from zero, stepping by one 3552 // - is the size of the widest induction variable type 3553 // then we create a new one. 3554 OldInduction = Legal->getPrimaryInduction(); 3555 Type *IdxTy = Legal->getWidestInductionType(); 3556 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3557 // The loop step is equal to the vectorization factor (num of SIMD elements) 3558 // times the unroll factor (num of SIMD instructions). 3559 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3560 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3561 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3562 Induction = 3563 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3564 getDebugLocFromInstOrOperands(OldInduction)); 3565 3566 // Emit phis for the new starting index of the scalar loop. 3567 createInductionResumeValues(Lp, CountRoundDown); 3568 3569 return completeLoopSkeleton(Lp, OrigLoopID); 3570 } 3571 3572 // Fix up external users of the induction variable. At this point, we are 3573 // in LCSSA form, with all external PHIs that use the IV having one input value, 3574 // coming from the remainder loop. We need those PHIs to also have a correct 3575 // value for the IV when arriving directly from the middle block. 3576 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3577 const InductionDescriptor &II, 3578 Value *CountRoundDown, Value *EndValue, 3579 BasicBlock *MiddleBlock) { 3580 // There are two kinds of external IV usages - those that use the value 3581 // computed in the last iteration (the PHI) and those that use the penultimate 3582 // value (the value that feeds into the phi from the loop latch). 3583 // We allow both, but they, obviously, have different values. 3584 3585 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3586 3587 DenseMap<Value *, Value *> MissingVals; 3588 3589 // An external user of the last iteration's value should see the value that 3590 // the remainder loop uses to initialize its own IV. 3591 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3592 for (User *U : PostInc->users()) { 3593 Instruction *UI = cast<Instruction>(U); 3594 if (!OrigLoop->contains(UI)) { 3595 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3596 MissingVals[UI] = EndValue; 3597 } 3598 } 3599 3600 // An external user of the penultimate value need to see EndValue - Step. 3601 // The simplest way to get this is to recompute it from the constituent SCEVs, 3602 // that is Start + (Step * (CRD - 1)). 3603 for (User *U : OrigPhi->users()) { 3604 auto *UI = cast<Instruction>(U); 3605 if (!OrigLoop->contains(UI)) { 3606 const DataLayout &DL = 3607 OrigLoop->getHeader()->getModule()->getDataLayout(); 3608 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3609 3610 IRBuilder<> B(MiddleBlock->getTerminator()); 3611 Value *CountMinusOne = B.CreateSub( 3612 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3613 Value *CMO = 3614 !II.getStep()->getType()->isIntegerTy() 3615 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3616 II.getStep()->getType()) 3617 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3618 CMO->setName("cast.cmo"); 3619 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3620 Escape->setName("ind.escape"); 3621 MissingVals[UI] = Escape; 3622 } 3623 } 3624 3625 for (auto &I : MissingVals) { 3626 PHINode *PHI = cast<PHINode>(I.first); 3627 // One corner case we have to handle is two IVs "chasing" each-other, 3628 // that is %IV2 = phi [...], [ %IV1, %latch ] 3629 // In this case, if IV1 has an external use, we need to avoid adding both 3630 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3631 // don't already have an incoming value for the middle block. 3632 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3633 PHI->addIncoming(I.second, MiddleBlock); 3634 } 3635 } 3636 3637 namespace { 3638 3639 struct CSEDenseMapInfo { 3640 static bool canHandle(const Instruction *I) { 3641 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3642 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3643 } 3644 3645 static inline Instruction *getEmptyKey() { 3646 return DenseMapInfo<Instruction *>::getEmptyKey(); 3647 } 3648 3649 static inline Instruction *getTombstoneKey() { 3650 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3651 } 3652 3653 static unsigned getHashValue(const Instruction *I) { 3654 assert(canHandle(I) && "Unknown instruction!"); 3655 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3656 I->value_op_end())); 3657 } 3658 3659 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3660 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3661 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3662 return LHS == RHS; 3663 return LHS->isIdenticalTo(RHS); 3664 } 3665 }; 3666 3667 } // end anonymous namespace 3668 3669 ///Perform cse of induction variable instructions. 3670 static void cse(BasicBlock *BB) { 3671 // Perform simple cse. 3672 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3673 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3674 Instruction *In = &*I++; 3675 3676 if (!CSEDenseMapInfo::canHandle(In)) 3677 continue; 3678 3679 // Check if we can replace this instruction with any of the 3680 // visited instructions. 3681 if (Instruction *V = CSEMap.lookup(In)) { 3682 In->replaceAllUsesWith(V); 3683 In->eraseFromParent(); 3684 continue; 3685 } 3686 3687 CSEMap[In] = In; 3688 } 3689 } 3690 3691 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3692 ElementCount VF, 3693 bool &NeedToScalarize) { 3694 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3695 Function *F = CI->getCalledFunction(); 3696 Type *ScalarRetTy = CI->getType(); 3697 SmallVector<Type *, 4> Tys, ScalarTys; 3698 for (auto &ArgOp : CI->arg_operands()) 3699 ScalarTys.push_back(ArgOp->getType()); 3700 3701 // Estimate cost of scalarized vector call. The source operands are assumed 3702 // to be vectors, so we need to extract individual elements from there, 3703 // execute VF scalar calls, and then gather the result into the vector return 3704 // value. 3705 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3706 TTI::TCK_RecipThroughput); 3707 if (VF.isScalar()) 3708 return ScalarCallCost; 3709 3710 // Compute corresponding vector type for return value and arguments. 
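// Illustrative numbers (purely hypothetical costs): with VF = 4, a scalar call
// cost of 10 and a scalarization overhead of 8, the scalarized estimate
// computed below is 4 * 10 + 8 = 48; if the target also provides a vector
// variant whose call cost is, say, 20, the vector call wins and
// NeedToScalarize is cleared.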
3711 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3712 for (Type *ScalarTy : ScalarTys) 3713 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3714 3715 // Compute costs of unpacking argument values for the scalar calls and 3716 // packing the return values to a vector. 3717 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3718 3719 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3720 3721 // If we can't emit a vector call for this function, then the currently found 3722 // cost is the cost we need to return. 3723 NeedToScalarize = true; 3724 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3725 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3726 3727 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3728 return Cost; 3729 3730 // If the corresponding vector cost is cheaper, return its cost. 3731 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3732 TTI::TCK_RecipThroughput); 3733 if (VectorCallCost < Cost) { 3734 NeedToScalarize = false; 3735 return VectorCallCost; 3736 } 3737 return Cost; 3738 } 3739 3740 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3741 ElementCount VF) { 3742 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3743 assert(ID && "Expected intrinsic call!"); 3744 3745 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3746 return TTI.getIntrinsicInstrCost(CostAttrs, 3747 TargetTransformInfo::TCK_RecipThroughput); 3748 } 3749 3750 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3751 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3752 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3753 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3754 } 3755 3756 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3757 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3758 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3759 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3760 } 3761 3762 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3763 // For every instruction `I` in MinBWs, truncate the operands, create a 3764 // truncated version of `I` and reextend its result. InstCombine runs 3765 // later and will remove any ext/trunc pairs. 3766 SmallPtrSet<Value *, 4> Erased; 3767 for (const auto &KV : Cost->getMinimalBitwidths()) { 3768 // If the value wasn't vectorized, we must maintain the original scalar 3769 // type. The absence of the value from VectorLoopValueMap indicates that it 3770 // wasn't vectorized. 3771 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3772 continue; 3773 for (unsigned Part = 0; Part < UF; ++Part) { 3774 Value *I = getOrCreateVectorValue(KV.first, Part); 3775 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3776 continue; 3777 Type *OriginalTy = I->getType(); 3778 Type *ScalarTruncatedTy = 3779 IntegerType::get(OriginalTy->getContext(), KV.second); 3780 auto *TruncatedTy = FixedVectorType::get( 3781 ScalarTruncatedTy, 3782 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3783 if (TruncatedTy == OriginalTy) 3784 continue; 3785 3786 IRBuilder<> B(cast<Instruction>(I)); 3787 auto ShrinkOperand = [&](Value *V) -> Value * { 3788 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3789 if (ZI->getSrcTy() == TruncatedTy) 3790 return ZI->getOperand(0); 3791 return B.CreateZExtOrTrunc(V, TruncatedTy); 3792 }; 3793 3794 // The actual instruction modification depends on the instruction type, 3795 // unfortunately. 
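// Worked example (assuming MinBWs says an i32 value only needs 8 bits and
// VF = 4): a <4 x i32> add is rebuilt below as an add on operands shrunk to
// <4 x i8> (reusing the source of an existing zext when it already has the
// narrow type), and the result is zero-extended back to <4 x i32>; InstCombine
// later strips the redundant trunc/zext pairs, as noted at the top of this
// function.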
3796 Value *NewI = nullptr; 3797 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3798 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3799 ShrinkOperand(BO->getOperand(1))); 3800 3801 // Any wrapping introduced by shrinking this operation shouldn't be 3802 // considered undefined behavior. So, we can't unconditionally copy 3803 // arithmetic wrapping flags to NewI. 3804 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3805 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3806 NewI = 3807 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3808 ShrinkOperand(CI->getOperand(1))); 3809 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3810 NewI = B.CreateSelect(SI->getCondition(), 3811 ShrinkOperand(SI->getTrueValue()), 3812 ShrinkOperand(SI->getFalseValue())); 3813 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3814 switch (CI->getOpcode()) { 3815 default: 3816 llvm_unreachable("Unhandled cast!"); 3817 case Instruction::Trunc: 3818 NewI = ShrinkOperand(CI->getOperand(0)); 3819 break; 3820 case Instruction::SExt: 3821 NewI = B.CreateSExtOrTrunc( 3822 CI->getOperand(0), 3823 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3824 break; 3825 case Instruction::ZExt: 3826 NewI = B.CreateZExtOrTrunc( 3827 CI->getOperand(0), 3828 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3829 break; 3830 } 3831 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3832 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3833 ->getNumElements(); 3834 auto *O0 = B.CreateZExtOrTrunc( 3835 SI->getOperand(0), 3836 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3837 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3838 ->getNumElements(); 3839 auto *O1 = B.CreateZExtOrTrunc( 3840 SI->getOperand(1), 3841 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3842 3843 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3844 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3845 // Don't do anything with the operands, just extend the result. 3846 continue; 3847 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3848 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3849 ->getNumElements(); 3850 auto *O0 = B.CreateZExtOrTrunc( 3851 IE->getOperand(0), 3852 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3853 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3854 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3855 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3856 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3857 ->getNumElements(); 3858 auto *O0 = B.CreateZExtOrTrunc( 3859 EE->getOperand(0), 3860 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3861 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3862 } else { 3863 // If we don't know what to do, be conservative and don't do anything. 3864 continue; 3865 } 3866 3867 // Lastly, extend the result. 3868 NewI->takeName(cast<Instruction>(I)); 3869 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3870 I->replaceAllUsesWith(Res); 3871 cast<Instruction>(I)->eraseFromParent(); 3872 Erased.insert(I); 3873 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3874 } 3875 } 3876 3877 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3878 for (const auto &KV : Cost->getMinimalBitwidths()) { 3879 // If the value wasn't vectorized, we must maintain the original scalar 3880 // type. 
The absence of the value from VectorLoopValueMap indicates that it
3881 // wasn't vectorized.
3882 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3883 continue;
3884 for (unsigned Part = 0; Part < UF; ++Part) {
3885 Value *I = getOrCreateVectorValue(KV.first, Part);
3886 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3887 if (Inst && Inst->use_empty()) {
3888 Value *NewI = Inst->getOperand(0);
3889 Inst->eraseFromParent();
3890 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3891 }
3892 }
3893 }
3894 }
3895
3896 void InnerLoopVectorizer::fixVectorizedLoop() {
3897 // Insert truncates and extends for any truncated instructions as hints to
3898 // InstCombine.
3899 if (VF.isVector())
3900 truncateToMinimalBitwidths();
3901
3902 // Fix widened non-induction PHIs by setting up the PHI operands.
3903 if (OrigPHIsToFix.size()) {
3904 assert(EnableVPlanNativePath &&
3905 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3906 fixNonInductionPHIs();
3907 }
3908
3909 // At this point every instruction in the original loop is widened to a
3910 // vector form. Now we need to fix the recurrences in the loop. These PHI
3911 // nodes are currently empty because we did not want to introduce cycles.
3912 // This is the second stage of vectorizing recurrences.
3913 fixCrossIterationPHIs();
3914
3915 // Forget the original basic block.
3916 PSE.getSE()->forgetLoop(OrigLoop);
3917
3918 // Fix-up external users of the induction variables.
3919 for (auto &Entry : Legal->getInductionVars())
3920 fixupIVUsers(Entry.first, Entry.second,
3921 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3922 IVEndValues[Entry.first], LoopMiddleBlock);
3923
3924 fixLCSSAPHIs();
3925 for (Instruction *PI : PredicatedInstructions)
3926 sinkScalarOperands(&*PI);
3927
3928 // Remove redundant induction instructions.
3929 cse(LoopVectorBody);
3930
3931 // Set/update profile weights for the vector and remainder loops as original
3932 // loop iterations are now distributed among them. Note that the original loop
3933 // represented by LoopScalarBody becomes the remainder loop after vectorization.
3934 //
3935 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3936 // end up getting a slightly roughened result but that should be OK since
3937 // profile is not inherently precise anyway. Note also that a possible bypass
3938 // of vector code caused by legality checks is ignored, assigning all the
3939 // weight to the vector loop, optimistically.
3940 //
3941 // For scalable vectorization, we can't know at compile time how many
3942 // iterations of the loop are handled in one vector iteration, so instead
3943 // assume a pessimistic vscale of '1'.
3944 setProfileInfoAfterUnrolling(
3945 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3946 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3947 }
3948
3949 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3950 // In order to support recurrences we need to be able to vectorize Phi nodes.
3951 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3952 // stage #2: We now need to fix the recurrences by adding incoming edges to
3953 // the currently empty PHI nodes. At this point every instruction in the
3954 // original loop is widened to a vector form so we can use them to construct
3955 // the incoming edges.
3956 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3957 // Handle first-order recurrences and reductions that need to be fixed.
3958 if (Legal->isFirstOrderRecurrence(&Phi)) 3959 fixFirstOrderRecurrence(&Phi); 3960 else if (Legal->isReductionVariable(&Phi)) 3961 fixReduction(&Phi); 3962 } 3963 } 3964 3965 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3966 // This is the second phase of vectorizing first-order recurrences. An 3967 // overview of the transformation is described below. Suppose we have the 3968 // following loop. 3969 // 3970 // for (int i = 0; i < n; ++i) 3971 // b[i] = a[i] - a[i - 1]; 3972 // 3973 // There is a first-order recurrence on "a". For this loop, the shorthand 3974 // scalar IR looks like: 3975 // 3976 // scalar.ph: 3977 // s_init = a[-1] 3978 // br scalar.body 3979 // 3980 // scalar.body: 3981 // i = phi [0, scalar.ph], [i+1, scalar.body] 3982 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3983 // s2 = a[i] 3984 // b[i] = s2 - s1 3985 // br cond, scalar.body, ... 3986 // 3987 // In this example, s1 is a recurrence because it's value depends on the 3988 // previous iteration. In the first phase of vectorization, we created a 3989 // temporary value for s1. We now complete the vectorization and produce the 3990 // shorthand vector IR shown below (for VF = 4, UF = 1). 3991 // 3992 // vector.ph: 3993 // v_init = vector(..., ..., ..., a[-1]) 3994 // br vector.body 3995 // 3996 // vector.body 3997 // i = phi [0, vector.ph], [i+4, vector.body] 3998 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3999 // v2 = a[i, i+1, i+2, i+3]; 4000 // v3 = vector(v1(3), v2(0, 1, 2)) 4001 // b[i, i+1, i+2, i+3] = v2 - v3 4002 // br cond, vector.body, middle.block 4003 // 4004 // middle.block: 4005 // x = v2(3) 4006 // br scalar.ph 4007 // 4008 // scalar.ph: 4009 // s_init = phi [x, middle.block], [a[-1], otherwise] 4010 // br scalar.body 4011 // 4012 // After execution completes the vector loop, we extract the next value of 4013 // the recurrence (x) to use as the initial value in the scalar loop. 4014 4015 // Get the original loop preheader and single loop latch. 4016 auto *Preheader = OrigLoop->getLoopPreheader(); 4017 auto *Latch = OrigLoop->getLoopLatch(); 4018 4019 // Get the initial and previous values of the scalar recurrence. 4020 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4021 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4022 4023 // Create a vector from the initial value. 4024 auto *VectorInit = ScalarInit; 4025 if (VF.isVector()) { 4026 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4027 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4028 VectorInit = Builder.CreateInsertElement( 4029 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4030 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4031 } 4032 4033 // We constructed a temporary phi node in the first phase of vectorization. 4034 // This phi node will eventually be deleted. 4035 Builder.SetInsertPoint( 4036 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 4037 4038 // Create a phi node for the new recurrence. The current value will either be 4039 // the initial value inserted into a vector or loop-varying vector value. 4040 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4041 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4042 4043 // Get the vectorized previous value of the last part UF - 1. It appears last 4044 // among all unrolled iterations, due to the order of their construction. 
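  // For example, with VF = 4 and UF = 2, part 1 of 'Previous' corresponds to
  // the last four scalar iterations of the unrolled vector iteration, so its
  // final lane holds the most recent value of the recurrence.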
4045 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 4046 4047 // Find and set the insertion point after the previous value if it is an 4048 // instruction. 4049 BasicBlock::iterator InsertPt; 4050 // Note that the previous value may have been constant-folded so it is not 4051 // guaranteed to be an instruction in the vector loop. 4052 // FIXME: Loop invariant values do not form recurrences. We should deal with 4053 // them earlier. 4054 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4055 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4056 else { 4057 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4058 if (isa<PHINode>(PreviousLastPart)) 4059 // If the previous value is a phi node, we should insert after all the phi 4060 // nodes in the block containing the PHI to avoid breaking basic block 4061 // verification. Note that the basic block may be different to 4062 // LoopVectorBody, in case we predicate the loop. 4063 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4064 else 4065 InsertPt = ++PreviousInst->getIterator(); 4066 } 4067 Builder.SetInsertPoint(&*InsertPt); 4068 4069 // We will construct a vector for the recurrence by combining the values for 4070 // the current and previous iterations. This is the required shuffle mask. 4071 assert(!VF.isScalable()); 4072 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4073 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4074 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4075 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4076 4077 // The vector from which to take the initial value for the current iteration 4078 // (actual or unrolled). Initially, this is the vector phi node. 4079 Value *Incoming = VecPhi; 4080 4081 // Shuffle the current and previous vector and update the vector parts. 4082 for (unsigned Part = 0; Part < UF; ++Part) { 4083 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 4084 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 4085 auto *Shuffle = 4086 VF.isVector() 4087 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4088 : Incoming; 4089 PhiPart->replaceAllUsesWith(Shuffle); 4090 cast<Instruction>(PhiPart)->eraseFromParent(); 4091 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 4092 Incoming = PreviousPart; 4093 } 4094 4095 // Fix the latch value of the new recurrence in the vector loop. 4096 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4097 4098 // Extract the last vector element in the middle block. This will be the 4099 // initial value for the recurrence when jumping to the scalar loop. 4100 auto *ExtractForScalar = Incoming; 4101 if (VF.isVector()) { 4102 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4103 ExtractForScalar = Builder.CreateExtractElement( 4104 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4105 "vector.recur.extract"); 4106 } 4107 // Extract the second last element in the middle block if the 4108 // Phi is used outside the loop. We need to extract the phi itself 4109 // and not the last element (the phi update in the current iteration). This 4110 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4111 // when the scalar loop is not run at all. 
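  // Continuing the example above with VF = 4, the lanes of 'Incoming' hold
  // <a[n-4], a[n-3], a[n-2], a[n-1]>. The scalar loop is seeded with a[n-1]
  // (the last lane), whereas a use of the phi itself outside the loop must
  // observe a[n-2], the value the phi held during the final iteration.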
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF.isVector())
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
        "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing (VF == 1, UF > 1),
  // initialize ExtractForPhiUsedOutsideLoop with the unrolled value just prior
  // to `Incoming`. This is analogous to the vectorized case above: extracting
  // the second last element when VF > 1.
  else if (UF > 1)
    ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getIncomingValue(0) == Phi) {
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
    }
  }
}

void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
  Constant *Zero = Builder.getInt32(0);

  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(Phi) &&
         "Unable to find the reduction variable");
  RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];

  RecurKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  setDebugLocFromInst(Builder, ReductionStartValue);
  bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);

  // We need to generate a reduction vector from the incoming scalar.
  // To do so, we need to generate the 'identity' vector and override
  // one of the elements with the incoming scalar reduction. We need
  // to do it in the vector-loop preheader.
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();

  // Find the reduction identity value: zero for addition, or and xor; one for
  // multiplication; -1 (all ones) for and.
  Value *Identity;
  Value *VectorStart;
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
    // MinMax reductions have the start value as their identity.
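    // For example, a smax reduction with start value %s and VF = 4 seeds the
    // vector phi with the splat <%s, %s, %s, %s>: duplicating the start value
    // across all lanes is safe because max(%s, %s) == %s.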
4176 if (VF.isScalar() || IsInLoopReductionPhi) { 4177 VectorStart = Identity = ReductionStartValue; 4178 } else { 4179 VectorStart = Identity = 4180 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 4181 } 4182 } else { 4183 // Handle other reduction kinds: 4184 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 4185 RK, VecTy->getScalarType()); 4186 if (VF.isScalar() || IsInLoopReductionPhi) { 4187 Identity = Iden; 4188 // This vector is the Identity vector where the first element is the 4189 // incoming scalar reduction. 4190 VectorStart = ReductionStartValue; 4191 } else { 4192 Identity = ConstantVector::getSplat(VF, Iden); 4193 4194 // This vector is the Identity vector where the first element is the 4195 // incoming scalar reduction. 4196 VectorStart = 4197 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 4198 } 4199 } 4200 4201 // Wrap flags are in general invalid after vectorization, clear them. 4202 clearReductionWrapFlags(RdxDesc); 4203 4204 // Fix the vector-loop phi. 4205 4206 // Reductions do not have to start at zero. They can start with 4207 // any loop invariant values. 4208 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4209 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4210 4211 for (unsigned Part = 0; Part < UF; ++Part) { 4212 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 4213 Value *Val = getOrCreateVectorValue(LoopVal, Part); 4214 // Make sure to add the reduction start value only to the 4215 // first unroll part. 4216 Value *StartVal = (Part == 0) ? VectorStart : Identity; 4217 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 4218 cast<PHINode>(VecRdxPhi) 4219 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4220 } 4221 4222 // Before each round, move the insertion point right between 4223 // the PHIs and the values we are going to write. 4224 // This allows us to write both PHINodes and the extractelement 4225 // instructions. 4226 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4227 4228 setDebugLocFromInst(Builder, LoopExitInst); 4229 4230 // If tail is folded by masking, the vector value to leave the loop should be 4231 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4232 // instead of the former. For an inloop reduction the reduction will already 4233 // be predicated, and does not need to be handled here. 4234 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4235 for (unsigned Part = 0; Part < UF; ++Part) { 4236 Value *VecLoopExitInst = 4237 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4238 Value *Sel = nullptr; 4239 for (User *U : VecLoopExitInst->users()) { 4240 if (isa<SelectInst>(U)) { 4241 assert(!Sel && "Reduction exit feeding two selects"); 4242 Sel = U; 4243 } else 4244 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4245 } 4246 assert(Sel && "Reduction exit feeds no select"); 4247 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4248 4249 // If the target can create a predicated operator for the reduction at no 4250 // extra cost in the loop (for example a predicated vadd), it can be 4251 // cheaper for the select to remain in the loop than be sunk out of it, 4252 // and so use the select value for the phi instead of the old 4253 // LoopExitValue. 
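      // For example, targets with predicated vector arithmetic (such as ARM
      // MVE) can fold 'select(mask, add, phi)' into a single predicated add,
      // so keeping the select in the loop is effectively free.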
4254 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4255 if (PreferPredicatedReductionSelect || 4256 TTI->preferPredicatedReductionSelect( 4257 RdxDesc.getRecurrenceBinOp(), Phi->getType(), 4258 TargetTransformInfo::ReductionFlags())) { 4259 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4260 VecRdxPhi->setIncomingValueForBlock( 4261 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4262 } 4263 } 4264 } 4265 4266 // If the vector reduction can be performed in a smaller type, we truncate 4267 // then extend the loop exit value to enable InstCombine to evaluate the 4268 // entire expression in the smaller type. 4269 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4270 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4271 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4272 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4273 Builder.SetInsertPoint( 4274 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4275 VectorParts RdxParts(UF); 4276 for (unsigned Part = 0; Part < UF; ++Part) { 4277 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4278 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4279 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4280 : Builder.CreateZExt(Trunc, VecTy); 4281 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4282 UI != RdxParts[Part]->user_end();) 4283 if (*UI != Trunc) { 4284 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4285 RdxParts[Part] = Extnd; 4286 } else { 4287 ++UI; 4288 } 4289 } 4290 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4291 for (unsigned Part = 0; Part < UF; ++Part) { 4292 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4293 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4294 } 4295 } 4296 4297 // Reduce all of the unrolled parts into a single vector. 4298 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4299 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 4300 4301 // The middle block terminator has already been assigned a DebugLoc here (the 4302 // OrigLoop's single latch terminator). We want the whole middle block to 4303 // appear to execute on this line because: (a) it is all compiler generated, 4304 // (b) these instructions are always executed after evaluating the latch 4305 // conditional branch, and (c) other passes may add new predecessors which 4306 // terminate on this line. This is the easiest way to ensure we don't 4307 // accidentally cause an extra step back into the loop while debugging. 4308 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4309 for (unsigned Part = 1; Part < UF; ++Part) { 4310 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4311 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4312 // Floating point operations had to be 'fast' to enable the reduction. 4313 ReducedPartRdx = addFastMathFlag( 4314 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4315 ReducedPartRdx, "bin.rdx"), 4316 RdxDesc.getFastMathFlags()); 4317 else 4318 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4319 } 4320 4321 // Create the reduction after the loop. Note that inloop reductions create the 4322 // target reduction in the loop using a Reduction recipe. 
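  // For example, an out-of-loop integer add reduction with VF = 4 reduces the
  // combined part vector to a single scalar with a horizontal vector reduction
  // in the middle block, which is then extended back to the phi type if the
  // reduction was narrowed.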
4323 if (VF.isVector() && !IsInLoopReductionPhi) { 4324 ReducedPartRdx = 4325 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4326 // If the reduction can be performed in a smaller type, we need to extend 4327 // the reduction to the wider type before we branch to the original loop. 4328 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4329 ReducedPartRdx = 4330 RdxDesc.isSigned() 4331 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4332 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4333 } 4334 4335 // Create a phi node that merges control-flow from the backedge-taken check 4336 // block and the middle block. 4337 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4338 LoopScalarPreHeader->getTerminator()); 4339 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4340 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4341 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4342 4343 // Now, we need to fix the users of the reduction variable 4344 // inside and outside of the scalar remainder loop. 4345 // We know that the loop is in LCSSA form. We need to update the 4346 // PHI nodes in the exit blocks. 4347 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4348 // All PHINodes need to have a single entry edge, or two if 4349 // we already fixed them. 4350 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4351 4352 // We found a reduction value exit-PHI. Update it with the 4353 // incoming bypass edge. 4354 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4355 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4356 } // end of the LCSSA phi scan. 4357 4358 // Fix the scalar loop reduction variable with the incoming reduction sum 4359 // from the vector body and from the backedge value. 4360 int IncomingEdgeBlockIdx = 4361 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4362 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4363 // Pick the other block. 4364 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4365 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4366 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4367 } 4368 4369 void InnerLoopVectorizer::clearReductionWrapFlags( 4370 RecurrenceDescriptor &RdxDesc) { 4371 RecurKind RK = RdxDesc.getRecurrenceKind(); 4372 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4373 return; 4374 4375 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4376 assert(LoopExitInstr && "null loop exit instruction"); 4377 SmallVector<Instruction *, 8> Worklist; 4378 SmallPtrSet<Instruction *, 8> Visited; 4379 Worklist.push_back(LoopExitInstr); 4380 Visited.insert(LoopExitInstr); 4381 4382 while (!Worklist.empty()) { 4383 Instruction *Cur = Worklist.pop_back_val(); 4384 if (isa<OverflowingBinaryOperator>(Cur)) 4385 for (unsigned Part = 0; Part < UF; ++Part) { 4386 Value *V = getOrCreateVectorValue(Cur, Part); 4387 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4388 } 4389 4390 for (User *U : Cur->users()) { 4391 Instruction *UI = cast<Instruction>(U); 4392 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4393 Visited.insert(UI).second) 4394 Worklist.push_back(UI); 4395 } 4396 } 4397 } 4398 4399 void InnerLoopVectorizer::fixLCSSAPHIs() { 4400 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4401 if (LCSSAPhi.getNumIncomingValues() == 1) { 4402 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4403 // Non-instruction incoming values will have only one value. 
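      // A value that is uniform after vectorization is live out from lane 0;
      // any other vectorized instruction is live out from the last lane
      // (VF - 1) of the final unrolled part.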
      unsigned LastLane = 0;
      if (isa<Instruction>(IncomingValue))
        LastLane = Cost->isUniformAfterVectorization(
                       cast<Instruction>(IncomingValue), VF)
                       ? 0
                       : VF.getKnownMinValue() - 1;
      assert((!VF.isScalable() || LastLane == 0) &&
             "scalable vectors don't support non-uniform scalars yet");
      // Can be a loop invariant incoming value or the last scalar value to be
      // extracted from the vectorized loop.
      Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
      Value *lastIncomingValue =
          getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
      LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
    }
  }
}

void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm terminates after a
  // pass through the worklist that does not sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is already in the
      // predicated block, is not in the loop, or may have side effects.
      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
          !VectorLoop->contains(I) || I->mayHaveSideEffects())
        continue;

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
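      // For example, after an 'add' feeding only the predicated instruction is
      // sunk, the 'mul' that feeds the 'add' may now have all of its uses in
      // PredBB and become sinkable on a later pass.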
4481 Changed = true; 4482 } 4483 } while (Changed); 4484 } 4485 4486 void InnerLoopVectorizer::fixNonInductionPHIs() { 4487 for (PHINode *OrigPhi : OrigPHIsToFix) { 4488 PHINode *NewPhi = 4489 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4490 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4491 4492 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4493 predecessors(OrigPhi->getParent())); 4494 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4495 predecessors(NewPhi->getParent())); 4496 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4497 "Scalar and Vector BB should have the same number of predecessors"); 4498 4499 // The insertion point in Builder may be invalidated by the time we get 4500 // here. Force the Builder insertion point to something valid so that we do 4501 // not run into issues during insertion point restore in 4502 // getOrCreateVectorValue calls below. 4503 Builder.SetInsertPoint(NewPhi); 4504 4505 // The predecessor order is preserved and we can rely on mapping between 4506 // scalar and vector block predecessors. 4507 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4508 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4509 4510 // When looking up the new scalar/vector values to fix up, use incoming 4511 // values from original phi. 4512 Value *ScIncV = 4513 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4514 4515 // Scalar incoming value may need a broadcast 4516 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4517 NewPhi->addIncoming(NewIncV, NewPredBB); 4518 } 4519 } 4520 } 4521 4522 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4523 VPUser &Operands, unsigned UF, 4524 ElementCount VF, bool IsPtrLoopInvariant, 4525 SmallBitVector &IsIndexLoopInvariant, 4526 VPTransformState &State) { 4527 // Construct a vector GEP by widening the operands of the scalar GEP as 4528 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4529 // results in a vector of pointers when at least one operand of the GEP 4530 // is vector-typed. Thus, to keep the representation compact, we only use 4531 // vector-typed operands for loop-varying values. 4532 4533 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4534 // If we are vectorizing, but the GEP has only loop-invariant operands, 4535 // the GEP we build (by only using vector-typed operands for 4536 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4537 // produce a vector of pointers, we need to either arbitrarily pick an 4538 // operand to broadcast, or broadcast a clone of the original GEP. 4539 // Here, we broadcast a clone of the original. 4540 // 4541 // TODO: If at some point we decide to scalarize instructions having 4542 // loop-invariant operands, this special case will no longer be 4543 // required. We would add the scalarization decision to 4544 // collectLoopScalars() and teach getVectorValue() to broadcast 4545 // the lane-zero scalar value. 4546 auto *Clone = Builder.Insert(GEP->clone()); 4547 for (unsigned Part = 0; Part < UF; ++Part) { 4548 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4549 State.set(VPDef, GEP, EntryPart, Part); 4550 addMetadata(EntryPart, GEP); 4551 } 4552 } else { 4553 // If the GEP has at least one loop-varying operand, we are sure to 4554 // produce a vector of pointers. But if we are only unrolling, we want 4555 // to produce a scalar GEP for each unroll part. 
Thus, the GEP we 4556 // produce with the code below will be scalar (if VF == 1) or vector 4557 // (otherwise). Note that for the unroll-only case, we still maintain 4558 // values in the vector mapping with initVector, as we do for other 4559 // instructions. 4560 for (unsigned Part = 0; Part < UF; ++Part) { 4561 // The pointer operand of the new GEP. If it's loop-invariant, we 4562 // won't broadcast it. 4563 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4564 : State.get(Operands.getOperand(0), Part); 4565 4566 // Collect all the indices for the new GEP. If any index is 4567 // loop-invariant, we won't broadcast it. 4568 SmallVector<Value *, 4> Indices; 4569 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4570 VPValue *Operand = Operands.getOperand(I); 4571 if (IsIndexLoopInvariant[I - 1]) 4572 Indices.push_back(State.get(Operand, {0, 0})); 4573 else 4574 Indices.push_back(State.get(Operand, Part)); 4575 } 4576 4577 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4578 // but it should be a vector, otherwise. 4579 auto *NewGEP = 4580 GEP->isInBounds() 4581 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4582 Indices) 4583 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4584 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4585 "NewGEP is not a pointer vector"); 4586 State.set(VPDef, GEP, NewGEP, Part); 4587 addMetadata(NewGEP, GEP); 4588 } 4589 } 4590 } 4591 4592 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4593 ElementCount VF) { 4594 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4595 PHINode *P = cast<PHINode>(PN); 4596 if (EnableVPlanNativePath) { 4597 // Currently we enter here in the VPlan-native path for non-induction 4598 // PHIs where all control flow is uniform. We simply widen these PHIs. 4599 // Create a vector phi with no operands - the vector phi operands will be 4600 // set at the end of vector code generation. 4601 Type *VecTy = 4602 (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF); 4603 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4604 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4605 OrigPHIsToFix.push_back(P); 4606 4607 return; 4608 } 4609 4610 assert(PN->getParent() == OrigLoop->getHeader() && 4611 "Non-header phis should have been handled elsewhere"); 4612 4613 // In order to support recurrences we need to be able to vectorize Phi nodes. 4614 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4615 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4616 // this value when we vectorize all of the instructions that use the PHI. 4617 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4618 for (unsigned Part = 0; Part < UF; ++Part) { 4619 // This is phase one of vectorizing PHIs. 4620 bool ScalarPHI = 4621 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4622 Type *VecTy = 4623 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4624 Value *EntryPart = PHINode::Create( 4625 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4626 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4627 } 4628 return; 4629 } 4630 4631 setDebugLocFromInst(Builder, P); 4632 4633 // This PHINode must be an induction variable. 4634 // Make sure that we know about it. 
4635 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4636 4637 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4638 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4639 4640 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4641 // which can be found from the original scalar operations. 4642 switch (II.getKind()) { 4643 case InductionDescriptor::IK_NoInduction: 4644 llvm_unreachable("Unknown induction"); 4645 case InductionDescriptor::IK_IntInduction: 4646 case InductionDescriptor::IK_FpInduction: 4647 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4648 case InductionDescriptor::IK_PtrInduction: { 4649 // Handle the pointer induction variable case. 4650 assert(P->getType()->isPointerTy() && "Unexpected type."); 4651 4652 if (Cost->isScalarAfterVectorization(P, VF)) { 4653 // This is the normalized GEP that starts counting at zero. 4654 Value *PtrInd = 4655 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4656 // Determine the number of scalars we need to generate for each unroll 4657 // iteration. If the instruction is uniform, we only need to generate the 4658 // first lane. Otherwise, we generate all VF values. 4659 unsigned Lanes = 4660 Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue(); 4661 for (unsigned Part = 0; Part < UF; ++Part) { 4662 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4663 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4664 Lane + Part * VF.getKnownMinValue()); 4665 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4666 Value *SclrGep = 4667 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4668 SclrGep->setName("next.gep"); 4669 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4670 } 4671 } 4672 return; 4673 } 4674 assert(isa<SCEVConstant>(II.getStep()) && 4675 "Induction step not a SCEV constant!"); 4676 Type *PhiType = II.getStep()->getType(); 4677 4678 // Build a pointer phi 4679 Value *ScalarStartValue = II.getStartValue(); 4680 Type *ScStValueType = ScalarStartValue->getType(); 4681 PHINode *NewPointerPhi = 4682 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4683 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4684 4685 // A pointer induction, performed by using a gep 4686 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4687 Instruction *InductionLoc = LoopLatch->getTerminator(); 4688 const SCEV *ScalarStep = II.getStep(); 4689 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4690 Value *ScalarStepValue = 4691 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4692 Value *InductionGEP = GetElementPtrInst::Create( 4693 ScStValueType->getPointerElementType(), NewPointerPhi, 4694 Builder.CreateMul( 4695 ScalarStepValue, 4696 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4697 "ptr.ind", InductionLoc); 4698 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4699 4700 // Create UF many actual address geps that use the pointer 4701 // phi as base and a vectorized version of the step value 4702 // (<step*0, ..., step*N>) as offset. 4703 for (unsigned Part = 0; Part < UF; ++Part) { 4704 SmallVector<Constant *, 8> Indices; 4705 // Create a vector of consecutive numbers from zero to VF. 
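      // For example, with VF = 4 the offsets are <0, 1, 2, 3> for part 0 and
      // <4, 5, 6, 7> for part 1, each later scaled by the pointer step.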
4706 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4707 Indices.push_back( 4708 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4709 Constant *StartOffset = ConstantVector::get(Indices); 4710 4711 Value *GEP = Builder.CreateGEP( 4712 ScStValueType->getPointerElementType(), NewPointerPhi, 4713 Builder.CreateMul( 4714 StartOffset, 4715 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4716 "vector.gep")); 4717 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4718 } 4719 } 4720 } 4721 } 4722 4723 /// A helper function for checking whether an integer division-related 4724 /// instruction may divide by zero (in which case it must be predicated if 4725 /// executed conditionally in the scalar code). 4726 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4727 /// Non-zero divisors that are non compile-time constants will not be 4728 /// converted into multiplication, so we will still end up scalarizing 4729 /// the division, but can do so w/o predication. 4730 static bool mayDivideByZero(Instruction &I) { 4731 assert((I.getOpcode() == Instruction::UDiv || 4732 I.getOpcode() == Instruction::SDiv || 4733 I.getOpcode() == Instruction::URem || 4734 I.getOpcode() == Instruction::SRem) && 4735 "Unexpected instruction"); 4736 Value *Divisor = I.getOperand(1); 4737 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4738 return !CInt || CInt->isZero(); 4739 } 4740 4741 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4742 VPUser &User, 4743 VPTransformState &State) { 4744 switch (I.getOpcode()) { 4745 case Instruction::Call: 4746 case Instruction::Br: 4747 case Instruction::PHI: 4748 case Instruction::GetElementPtr: 4749 case Instruction::Select: 4750 llvm_unreachable("This instruction is handled by a different recipe."); 4751 case Instruction::UDiv: 4752 case Instruction::SDiv: 4753 case Instruction::SRem: 4754 case Instruction::URem: 4755 case Instruction::Add: 4756 case Instruction::FAdd: 4757 case Instruction::Sub: 4758 case Instruction::FSub: 4759 case Instruction::FNeg: 4760 case Instruction::Mul: 4761 case Instruction::FMul: 4762 case Instruction::FDiv: 4763 case Instruction::FRem: 4764 case Instruction::Shl: 4765 case Instruction::LShr: 4766 case Instruction::AShr: 4767 case Instruction::And: 4768 case Instruction::Or: 4769 case Instruction::Xor: { 4770 // Just widen unops and binops. 4771 setDebugLocFromInst(Builder, &I); 4772 4773 for (unsigned Part = 0; Part < UF; ++Part) { 4774 SmallVector<Value *, 2> Ops; 4775 for (VPValue *VPOp : User.operands()) 4776 Ops.push_back(State.get(VPOp, Part)); 4777 4778 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4779 4780 if (auto *VecOp = dyn_cast<Instruction>(V)) 4781 VecOp->copyIRFlags(&I); 4782 4783 // Use this vector value for all users of the original instruction. 4784 State.set(Def, &I, V, Part); 4785 addMetadata(V, &I); 4786 } 4787 4788 break; 4789 } 4790 case Instruction::ICmp: 4791 case Instruction::FCmp: { 4792 // Widen compares. Generate vector compares. 4793 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4794 auto *Cmp = cast<CmpInst>(&I); 4795 setDebugLocFromInst(Builder, Cmp); 4796 for (unsigned Part = 0; Part < UF; ++Part) { 4797 Value *A = State.get(User.getOperand(0), Part); 4798 Value *B = State.get(User.getOperand(1), Part); 4799 Value *C = nullptr; 4800 if (FCmp) { 4801 // Propagate fast math flags. 
4802 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4803 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4804 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4805 } else { 4806 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4807 } 4808 State.set(Def, &I, C, Part); 4809 addMetadata(C, &I); 4810 } 4811 4812 break; 4813 } 4814 4815 case Instruction::ZExt: 4816 case Instruction::SExt: 4817 case Instruction::FPToUI: 4818 case Instruction::FPToSI: 4819 case Instruction::FPExt: 4820 case Instruction::PtrToInt: 4821 case Instruction::IntToPtr: 4822 case Instruction::SIToFP: 4823 case Instruction::UIToFP: 4824 case Instruction::Trunc: 4825 case Instruction::FPTrunc: 4826 case Instruction::BitCast: { 4827 auto *CI = cast<CastInst>(&I); 4828 setDebugLocFromInst(Builder, CI); 4829 4830 /// Vectorize casts. 4831 Type *DestTy = 4832 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4833 4834 for (unsigned Part = 0; Part < UF; ++Part) { 4835 Value *A = State.get(User.getOperand(0), Part); 4836 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4837 State.set(Def, &I, Cast, Part); 4838 addMetadata(Cast, &I); 4839 } 4840 break; 4841 } 4842 default: 4843 // This instruction is not vectorized by simple widening. 4844 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4845 llvm_unreachable("Unhandled instruction!"); 4846 } // end of switch. 4847 } 4848 4849 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4850 VPUser &ArgOperands, 4851 VPTransformState &State) { 4852 assert(!isa<DbgInfoIntrinsic>(I) && 4853 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4854 setDebugLocFromInst(Builder, &I); 4855 4856 Module *M = I.getParent()->getParent()->getParent(); 4857 auto *CI = cast<CallInst>(&I); 4858 4859 SmallVector<Type *, 4> Tys; 4860 for (Value *ArgOperand : CI->arg_operands()) 4861 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4862 4863 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4864 4865 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4866 // version of the instruction. 4867 // Is it beneficial to perform intrinsic call compared to lib call? 4868 bool NeedToScalarize = false; 4869 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4870 bool UseVectorIntrinsic = 4871 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4872 assert((UseVectorIntrinsic || !NeedToScalarize) && 4873 "Instruction should be scalarized elsewhere."); 4874 4875 for (unsigned Part = 0; Part < UF; ++Part) { 4876 SmallVector<Value *, 4> Args; 4877 for (auto &I : enumerate(ArgOperands.operands())) { 4878 // Some intrinsics have a scalar argument - don't replace it with a 4879 // vector. 4880 Value *Arg; 4881 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4882 Arg = State.get(I.value(), Part); 4883 else 4884 Arg = State.get(I.value(), {0, 0}); 4885 Args.push_back(Arg); 4886 } 4887 4888 Function *VectorF; 4889 if (UseVectorIntrinsic) { 4890 // Use vector version of the intrinsic. 4891 Type *TysForDecl[] = {CI->getType()}; 4892 if (VF.isVector()) { 4893 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4894 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4895 } 4896 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4897 assert(VectorF && "Can't retrieve vector intrinsic."); 4898 } else { 4899 // Use vector version of the function call. 
4900 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4901 #ifndef NDEBUG 4902 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4903 "Can't create vector function."); 4904 #endif 4905 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4906 } 4907 SmallVector<OperandBundleDef, 1> OpBundles; 4908 CI->getOperandBundlesAsDefs(OpBundles); 4909 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4910 4911 if (isa<FPMathOperator>(V)) 4912 V->copyFastMathFlags(CI); 4913 4914 State.set(Def, &I, V, Part); 4915 addMetadata(V, &I); 4916 } 4917 } 4918 4919 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4920 VPUser &Operands, 4921 bool InvariantCond, 4922 VPTransformState &State) { 4923 setDebugLocFromInst(Builder, &I); 4924 4925 // The condition can be loop invariant but still defined inside the 4926 // loop. This means that we can't just use the original 'cond' value. 4927 // We have to take the 'vectorized' value and pick the first lane. 4928 // Instcombine will make this a no-op. 4929 auto *InvarCond = 4930 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4931 4932 for (unsigned Part = 0; Part < UF; ++Part) { 4933 Value *Cond = 4934 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4935 Value *Op0 = State.get(Operands.getOperand(1), Part); 4936 Value *Op1 = State.get(Operands.getOperand(2), Part); 4937 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4938 State.set(VPDef, &I, Sel, Part); 4939 addMetadata(Sel, &I); 4940 } 4941 } 4942 4943 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4944 // We should not collect Scalars more than once per VF. Right now, this 4945 // function is called from collectUniformsAndScalars(), which already does 4946 // this check. Collecting Scalars for VF=1 does not make any sense. 4947 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4948 "This function should not be visited twice for the same VF"); 4949 4950 SmallSetVector<Instruction *, 8> Worklist; 4951 4952 // These sets are used to seed the analysis with pointers used by memory 4953 // accesses that will remain scalar. 4954 SmallSetVector<Instruction *, 8> ScalarPtrs; 4955 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4956 auto *Latch = TheLoop->getLoopLatch(); 4957 4958 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4959 // The pointer operands of loads and stores will be scalar as long as the 4960 // memory access is not a gather or scatter operation. The value operand of a 4961 // store will remain scalar if the store is scalarized. 4962 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4963 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4964 assert(WideningDecision != CM_Unknown && 4965 "Widening decision should be ready at this moment"); 4966 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4967 if (Ptr == Store->getValueOperand()) 4968 return WideningDecision == CM_Scalarize; 4969 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4970 "Ptr is neither a value or pointer operand"); 4971 return WideningDecision != CM_GatherScatter; 4972 }; 4973 4974 // A helper that returns true if the given value is a bitcast or 4975 // getelementptr instruction contained in the loop. 
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
    if (!isa<PHINode>(Ptr) ||
        !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
      return false;
    auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
    if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
      return false;
    return isScalarUse(MemAccess, Ptr);
  };

  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is actually the pointer induction of a loop, it is inserted into
  // the Worklist. If the use will be a scalar use, and the pointer is only
  // used by memory accesses, we place the pointer in ScalarPtrs. Otherwise,
  // the pointer is placed in PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    if (isScalarPtrInduction(MemAccess, Ptr)) {
      Worklist.insert(cast<Instruction>(Ptr));
      Instruction *Update = cast<Instruction>(
          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
      Worklist.insert(Update);
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
                        << "\n");
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
                        << "\n");
      return;
    }
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
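  // For example, given 'a[i] = b[c[i]]', the load of b[c[i]] becomes a gather,
  // so its pointer must be a vector, whereas the consecutive store to a[i] is
  // widened with a scalar pointer.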
5045 for (auto *BB : TheLoop->blocks()) 5046 for (auto &I : *BB) { 5047 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5048 evaluatePtrUse(Load, Load->getPointerOperand()); 5049 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5050 evaluatePtrUse(Store, Store->getPointerOperand()); 5051 evaluatePtrUse(Store, Store->getValueOperand()); 5052 } 5053 } 5054 for (auto *I : ScalarPtrs) 5055 if (!PossibleNonScalarPtrs.count(I)) { 5056 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5057 Worklist.insert(I); 5058 } 5059 5060 // Insert the forced scalars. 5061 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5062 // induction variable when the PHI user is scalarized. 5063 auto ForcedScalar = ForcedScalars.find(VF); 5064 if (ForcedScalar != ForcedScalars.end()) 5065 for (auto *I : ForcedScalar->second) 5066 Worklist.insert(I); 5067 5068 // Expand the worklist by looking through any bitcasts and getelementptr 5069 // instructions we've already identified as scalar. This is similar to the 5070 // expansion step in collectLoopUniforms(); however, here we're only 5071 // expanding to include additional bitcasts and getelementptr instructions. 5072 unsigned Idx = 0; 5073 while (Idx != Worklist.size()) { 5074 Instruction *Dst = Worklist[Idx++]; 5075 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5076 continue; 5077 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5078 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5079 auto *J = cast<Instruction>(U); 5080 return !TheLoop->contains(J) || Worklist.count(J) || 5081 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5082 isScalarUse(J, Src)); 5083 })) { 5084 Worklist.insert(Src); 5085 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5086 } 5087 } 5088 5089 // An induction variable will remain scalar if all users of the induction 5090 // variable and induction variable update remain scalar. 5091 for (auto &Induction : Legal->getInductionVars()) { 5092 auto *Ind = Induction.first; 5093 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5094 5095 // If tail-folding is applied, the primary induction variable will be used 5096 // to feed a vector compare. 5097 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5098 continue; 5099 5100 // Determine if all users of the induction variable are scalar after 5101 // vectorization. 5102 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5103 auto *I = cast<Instruction>(U); 5104 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5105 }); 5106 if (!ScalarInd) 5107 continue; 5108 5109 // Determine if all users of the induction variable update instruction are 5110 // scalar after vectorization. 5111 auto ScalarIndUpdate = 5112 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5113 auto *I = cast<Instruction>(U); 5114 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5115 }); 5116 if (!ScalarIndUpdate) 5117 continue; 5118 5119 // The induction variable and its update instruction will remain scalar. 
5120 Worklist.insert(Ind); 5121 Worklist.insert(IndUpdate); 5122 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5123 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5124 << "\n"); 5125 } 5126 5127 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5128 } 5129 5130 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5131 ElementCount VF) { 5132 if (!blockNeedsPredication(I->getParent())) 5133 return false; 5134 switch(I->getOpcode()) { 5135 default: 5136 break; 5137 case Instruction::Load: 5138 case Instruction::Store: { 5139 if (!Legal->isMaskRequired(I)) 5140 return false; 5141 auto *Ptr = getLoadStorePointerOperand(I); 5142 auto *Ty = getMemInstValueType(I); 5143 // We have already decided how to vectorize this instruction, get that 5144 // result. 5145 if (VF.isVector()) { 5146 InstWidening WideningDecision = getWideningDecision(I, VF); 5147 assert(WideningDecision != CM_Unknown && 5148 "Widening decision should be ready at this moment"); 5149 return WideningDecision == CM_Scalarize; 5150 } 5151 const Align Alignment = getLoadStoreAlignment(I); 5152 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5153 isLegalMaskedGather(Ty, Alignment)) 5154 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5155 isLegalMaskedScatter(Ty, Alignment)); 5156 } 5157 case Instruction::UDiv: 5158 case Instruction::SDiv: 5159 case Instruction::SRem: 5160 case Instruction::URem: 5161 return mayDivideByZero(*I); 5162 } 5163 return false; 5164 } 5165 5166 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5167 Instruction *I, ElementCount VF) { 5168 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5169 assert(getWideningDecision(I, VF) == CM_Unknown && 5170 "Decision should not be set yet."); 5171 auto *Group = getInterleavedAccessGroup(I); 5172 assert(Group && "Must have a group."); 5173 5174 // If the instruction's allocated size doesn't equal it's type size, it 5175 // requires padding and will be scalarized. 5176 auto &DL = I->getModule()->getDataLayout(); 5177 auto *ScalarTy = getMemInstValueType(I); 5178 if (hasIrregularType(ScalarTy, DL, VF)) 5179 return false; 5180 5181 // Check if masking is required. 5182 // A Group may need masking for one of two reasons: it resides in a block that 5183 // needs predication, or it was decided to use masking to deal with gaps. 5184 bool PredicatedAccessRequiresMasking = 5185 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5186 bool AccessWithGapsRequiresMasking = 5187 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5188 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5189 return true; 5190 5191 // If masked interleaving is required, we expect that the user/target had 5192 // enabled it, because otherwise it either wouldn't have been created or 5193 // it should have been invalidated by the CostModel. 5194 assert(useMaskedInterleavedAccesses(TTI) && 5195 "Masked interleave-groups for predicated accesses are not enabled."); 5196 5197 auto *Ty = getMemInstValueType(I); 5198 const Align Alignment = getLoadStoreAlignment(I); 5199 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5200 : TTI.isLegalMaskedStore(Ty, Alignment); 5201 } 5202 5203 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5204 Instruction *I, ElementCount VF) { 5205 // Get and ensure we have a valid memory instruction. 
5206 LoadInst *LI = dyn_cast<LoadInst>(I); 5207 StoreInst *SI = dyn_cast<StoreInst>(I); 5208 assert((LI || SI) && "Invalid memory instruction"); 5209 5210 auto *Ptr = getLoadStorePointerOperand(I); 5211 5212 // In order to be widened, the pointer should be consecutive, first of all. 5213 if (!Legal->isConsecutivePtr(Ptr)) 5214 return false; 5215 5216 // If the instruction is a store located in a predicated block, it will be 5217 // scalarized. 5218 if (isScalarWithPredication(I)) 5219 return false; 5220 5221 // If the instruction's allocated size doesn't equal it's type size, it 5222 // requires padding and will be scalarized. 5223 auto &DL = I->getModule()->getDataLayout(); 5224 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5225 if (hasIrregularType(ScalarTy, DL, VF)) 5226 return false; 5227 5228 return true; 5229 } 5230 5231 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5232 // We should not collect Uniforms more than once per VF. Right now, 5233 // this function is called from collectUniformsAndScalars(), which 5234 // already does this check. Collecting Uniforms for VF=1 does not make any 5235 // sense. 5236 5237 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5238 "This function should not be visited twice for the same VF"); 5239 5240 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5241 // not analyze again. Uniforms.count(VF) will return 1. 5242 Uniforms[VF].clear(); 5243 5244 // We now know that the loop is vectorizable! 5245 // Collect instructions inside the loop that will remain uniform after 5246 // vectorization. 5247 5248 // Global values, params and instructions outside of current loop are out of 5249 // scope. 5250 auto isOutOfScope = [&](Value *V) -> bool { 5251 Instruction *I = dyn_cast<Instruction>(V); 5252 return (!I || !TheLoop->contains(I)); 5253 }; 5254 5255 SetVector<Instruction *> Worklist; 5256 BasicBlock *Latch = TheLoop->getLoopLatch(); 5257 5258 // Instructions that are scalar with predication must not be considered 5259 // uniform after vectorization, because that would create an erroneous 5260 // replicating region where only a single instance out of VF should be formed. 5261 // TODO: optimize such seldom cases if found important, see PR40816. 5262 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5263 if (isOutOfScope(I)) { 5264 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5265 << *I << "\n"); 5266 return; 5267 } 5268 if (isScalarWithPredication(I, VF)) { 5269 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5270 << *I << "\n"); 5271 return; 5272 } 5273 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5274 Worklist.insert(I); 5275 }; 5276 5277 // Start with the conditional branch. If the branch condition is an 5278 // instruction contained in the loop that is only used by the branch, it is 5279 // uniform. 5280 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5281 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5282 addToWorklistIfAllowed(Cmp); 5283 5284 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5285 InstWidening WideningDecision = getWideningDecision(I, VF); 5286 assert(WideningDecision != CM_Unknown && 5287 "Widening decision should be ready at this moment"); 5288 5289 // A uniform memory op is itself uniform. We exclude uniform stores 5290 // here as they demand the last lane, not the first one. 
5291 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5292 assert(WideningDecision == CM_Scalarize); 5293 return true; 5294 } 5295 5296 return (WideningDecision == CM_Widen || 5297 WideningDecision == CM_Widen_Reverse || 5298 WideningDecision == CM_Interleave); 5299 }; 5300 5301 5302 // Returns true if Ptr is the pointer operand of a memory access instruction 5303 // I, and I is known to not require scalarization. 5304 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5305 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5306 }; 5307 5308 // Holds a list of values which are known to have at least one uniform use. 5309 // Note that there may be other uses which aren't uniform. A "uniform use" 5310 // here is something which only demands lane 0 of the unrolled iterations; 5311 // it does not imply that all lanes produce the same value (e.g. this is not 5312 // the usual meaning of uniform) 5313 SmallPtrSet<Value *, 8> HasUniformUse; 5314 5315 // Scan the loop for instructions which are either a) known to have only 5316 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5317 for (auto *BB : TheLoop->blocks()) 5318 for (auto &I : *BB) { 5319 // If there's no pointer operand, there's nothing to do. 5320 auto *Ptr = getLoadStorePointerOperand(&I); 5321 if (!Ptr) 5322 continue; 5323 5324 // A uniform memory op is itself uniform. We exclude uniform stores 5325 // here as they demand the last lane, not the first one. 5326 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5327 addToWorklistIfAllowed(&I); 5328 5329 if (isUniformDecision(&I, VF)) { 5330 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5331 HasUniformUse.insert(Ptr); 5332 } 5333 } 5334 5335 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5336 // demanding) users. Since loops are assumed to be in LCSSA form, this 5337 // disallows uses outside the loop as well. 5338 for (auto *V : HasUniformUse) { 5339 if (isOutOfScope(V)) 5340 continue; 5341 auto *I = cast<Instruction>(V); 5342 auto UsersAreMemAccesses = 5343 llvm::all_of(I->users(), [&](User *U) -> bool { 5344 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5345 }); 5346 if (UsersAreMemAccesses) 5347 addToWorklistIfAllowed(I); 5348 } 5349 5350 // Expand Worklist in topological order: whenever a new instruction 5351 // is added , its users should be already inside Worklist. It ensures 5352 // a uniform instruction will only be used by uniform instructions. 5353 unsigned idx = 0; 5354 while (idx != Worklist.size()) { 5355 Instruction *I = Worklist[idx++]; 5356 5357 for (auto OV : I->operand_values()) { 5358 // isOutOfScope operands cannot be uniform instructions. 5359 if (isOutOfScope(OV)) 5360 continue; 5361 // First order recurrence Phi's should typically be considered 5362 // non-uniform. 5363 auto *OP = dyn_cast<PHINode>(OV); 5364 if (OP && Legal->isFirstOrderRecurrence(OP)) 5365 continue; 5366 // If all the users of the operand are uniform, then add the 5367 // operand into the uniform worklist. 5368 auto *OI = cast<Instruction>(OV); 5369 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5370 auto *J = cast<Instruction>(U); 5371 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5372 })) 5373 addToWorklistIfAllowed(OI); 5374 } 5375 } 5376 5377 // For an instruction to be added into Worklist above, all its users inside 5378 // the loop should also be in Worklist. 
However, this condition cannot be
5379 // true for phi nodes that form a cyclic dependence. We must process phi
5380 // nodes separately. An induction variable will remain uniform if all users
5381 // of the induction variable and induction variable update remain uniform.
5382 // The code below handles both pointer and non-pointer induction variables.
5383 for (auto &Induction : Legal->getInductionVars()) {
5384 auto *Ind = Induction.first;
5385 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5386
5387 // Determine if all users of the induction variable are uniform after
5388 // vectorization.
5389 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5390 auto *I = cast<Instruction>(U);
5391 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5392 isVectorizedMemAccessUse(I, Ind);
5393 });
5394 if (!UniformInd)
5395 continue;
5396
5397 // Determine if all users of the induction variable update instruction are
5398 // uniform after vectorization.
5399 auto UniformIndUpdate =
5400 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5401 auto *I = cast<Instruction>(U);
5402 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5403 isVectorizedMemAccessUse(I, IndUpdate);
5404 });
5405 if (!UniformIndUpdate)
5406 continue;
5407
5408 // The induction variable and its update instruction will remain uniform.
5409 addToWorklistIfAllowed(Ind);
5410 addToWorklistIfAllowed(IndUpdate);
5411 }
5412
5413 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5414 }
5415
5416 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5417 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5418
5419 if (Legal->getRuntimePointerChecking()->Need) {
5420 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5421 "runtime pointer checks needed. Enable vectorization of this "
5422 "loop with '#pragma clang loop vectorize(enable)' when "
5423 "compiling with -Os/-Oz",
5424 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5425 return true;
5426 }
5427
5428 if (!PSE.getUnionPredicate().getPredicates().empty()) {
5429 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5430 "runtime SCEV checks needed. Enable vectorization of this "
5431 "loop with '#pragma clang loop vectorize(enable)' when "
5432 "compiling with -Os/-Oz",
5433 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5434 return true;
5435 }
5436
5437 // FIXME: Avoid specializing for stride==1 instead of bailing out.
5438 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5439 reportVectorizationFailure("Runtime stride check for small trip count",
5440 "runtime stride == 1 checks needed. Enable vectorization of "
5441 "this loop without such check by compiling with -Os/-Oz",
5442 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5443 return true;
5444 }
5445
5446 return false;
5447 }
5448
5449 Optional<ElementCount>
5450 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5451 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5452 // TODO: It may be useful to do this anyway, since the check is still likely
5453 // to be dynamically uniform if the target can skip it.
5454 reportVectorizationFailure(
5455 "Not inserting runtime ptr check for divergent target",
5456 "runtime pointer checks needed. Not enabled for divergent target",
5457 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5458 return None;
5459 }
5460
5461 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5462 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5463 if (TC == 1) {
5464 reportVectorizationFailure("Single iteration (non) loop",
5465 "loop trip count is one, irrelevant for vectorization",
5466 "SingleIterationLoop", ORE, TheLoop);
5467 return None;
5468 }
5469
5470 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5471
5472 switch (ScalarEpilogueStatus) {
5473 case CM_ScalarEpilogueAllowed:
5474 return MaxVF;
5475 case CM_ScalarEpilogueNotAllowedUsePredicate:
5476 LLVM_FALLTHROUGH;
5477 case CM_ScalarEpilogueNotNeededUsePredicate:
5478 LLVM_DEBUG(
5479 dbgs() << "LV: vector predicate hint/switch found.\n"
5480 << "LV: Not allowing scalar epilogue, creating predicated "
5481 << "vector loop.\n");
5482 break;
5483 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5484 // fallthrough as a special case of OptForSize
5485 case CM_ScalarEpilogueNotAllowedOptSize:
5486 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5487 LLVM_DEBUG(
5488 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5489 else
5490 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5491 << "count.\n");
5492
5493 // Bail if runtime checks are required, which are not good when optimising
5494 // for size.
5495 if (runtimeChecksRequired())
5496 return None;
5497
5498 break;
5499 }
5500
5501 // The only loops we can vectorize without a scalar epilogue are loops with
5502 // a bottom-test and a single exiting block. We'd have to handle the fact
5503 // that not every instruction executes on the last iteration. This will
5504 // require a lane mask which varies through the vector loop body. (TODO)
5505 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5506 // If there was a tail-folding hint/switch, but we can't fold the tail by
5507 // masking, fall back to a vectorization with a scalar epilogue.
5508 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5509 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5510 "scalar epilogue instead.\n");
5511 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5512 return MaxVF;
5513 }
5514 return None;
5515 }
5516
5517 // Now try to fold the tail by masking.
5518
5519 // Invalidate interleave groups that require an epilogue if we can't mask
5520 // the interleave-group.
5521 if (!useMaskedInterleavedAccesses(TTI)) {
5522 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5523 "No decisions should have been taken at this point");
5524 // Note: There is no need to invalidate any cost modeling decisions here, as
5525 // none were taken so far.
5526 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5527 }
5528
5529 assert(!MaxVF.isScalable() &&
5530 "Scalable vectors do not yet support tail folding");
5531 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5532 "MaxVF must be a power of 2");
5533 unsigned MaxVFtimesIC =
5534 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5535 // Avoid tail folding if the trip count is known to be a multiple of any VF we
5536 // chose.
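// Worked example with hypothetical numbers: for MaxVF = 8 and UserIC = 2,
// MaxVFtimesIC is 16; a loop whose exit count (backedge-taken count + 1) is
// known to be 64 gives 64 urem 16 == 0, so no tail remains and MaxVF can be
// returned without folding the tail.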
5537 ScalarEvolution *SE = PSE.getSE(); 5538 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5539 const SCEV *ExitCount = SE->getAddExpr( 5540 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5541 const SCEV *Rem = SE->getURemExpr( 5542 ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5543 if (Rem->isZero()) { 5544 // Accept MaxVF if we do not have a tail. 5545 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5546 return MaxVF; 5547 } 5548 5549 // If we don't know the precise trip count, or if the trip count that we 5550 // found modulo the vectorization factor is not zero, try to fold the tail 5551 // by masking. 5552 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5553 if (Legal->prepareToFoldTailByMasking()) { 5554 FoldTailByMasking = true; 5555 return MaxVF; 5556 } 5557 5558 // If there was a tail-folding hint/switch, but we can't fold the tail by 5559 // masking, fallback to a vectorization with a scalar epilogue. 5560 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5561 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5562 "scalar epilogue instead.\n"); 5563 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5564 return MaxVF; 5565 } 5566 5567 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5568 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5569 return None; 5570 } 5571 5572 if (TC == 0) { 5573 reportVectorizationFailure( 5574 "Unable to calculate the loop count due to complex control flow", 5575 "unable to calculate the loop count due to complex control flow", 5576 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5577 return None; 5578 } 5579 5580 reportVectorizationFailure( 5581 "Cannot optimize for size and vectorize at the same time.", 5582 "cannot optimize for size and vectorize at the same time. " 5583 "Enable vectorization of this loop with '#pragma clang loop " 5584 "vectorize(enable)' when compiling with -Os/-Oz", 5585 "NoTailLoopWithOptForSize", ORE, TheLoop); 5586 return None; 5587 } 5588 5589 ElementCount 5590 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5591 ElementCount UserVF) { 5592 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5593 unsigned SmallestType, WidestType; 5594 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5595 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5596 5597 // Get the maximum safe dependence distance in bits computed by LAA. 5598 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5599 // the memory accesses that is most restrictive (involved in the smallest 5600 // dependence distance). 5601 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5602 5603 if (UserVF.isNonZero()) { 5604 // For now, don't verify legality of scalable vectors. 5605 // This will be addressed properly in https://reviews.llvm.org/D91718. 5606 if (UserVF.isScalable()) 5607 return UserVF; 5608 5609 // If legally unsafe, clamp the user vectorization factor to a safe value. 
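// For illustration (hypothetical values): if LAA reports a maximum safe
// vector width of 256 bits and the widest loop type is 64 bits, then
// MaxSafeVF = PowerOf2Floor(256 / 64) = 4, and a user-requested fixed VF of
// 8 would be clamped down to 4 below.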
5610 unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5611 if (UserVF.getFixedValue() <= MaxSafeVF) 5612 return UserVF; 5613 5614 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5615 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5616 << ".\n"); 5617 ORE->emit([&]() { 5618 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5619 TheLoop->getStartLoc(), 5620 TheLoop->getHeader()) 5621 << "User-specified vectorization factor " 5622 << ore::NV("UserVectorizationFactor", UserVF) 5623 << " is unsafe, clamping to maximum safe vectorization factor " 5624 << ore::NV("VectorizationFactor", MaxSafeVF); 5625 }); 5626 return ElementCount::getFixed(MaxSafeVF); 5627 } 5628 5629 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5630 5631 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5632 // Note that both WidestRegister and WidestType may not be a powers of 2. 5633 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5634 5635 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5636 << " / " << WidestType << " bits.\n"); 5637 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5638 << WidestRegister << " bits.\n"); 5639 5640 assert(MaxVectorSize <= WidestRegister && 5641 "Did not expect to pack so many elements" 5642 " into one vector!"); 5643 if (MaxVectorSize == 0) { 5644 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5645 MaxVectorSize = 1; 5646 return ElementCount::getFixed(MaxVectorSize); 5647 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5648 isPowerOf2_32(ConstTripCount)) { 5649 // We need to clamp the VF to be the ConstTripCount. There is no point in 5650 // choosing a higher viable VF as done in the loop below. 5651 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5652 << ConstTripCount << "\n"); 5653 MaxVectorSize = ConstTripCount; 5654 return ElementCount::getFixed(MaxVectorSize); 5655 } 5656 5657 unsigned MaxVF = MaxVectorSize; 5658 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5659 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5660 // Collect all viable vectorization factors larger than the default MaxVF 5661 // (i.e. MaxVectorSize). 5662 SmallVector<ElementCount, 8> VFs; 5663 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5664 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5665 VFs.push_back(ElementCount::getFixed(VS)); 5666 5667 // For each VF calculate its register usage. 5668 auto RUs = calculateRegisterUsage(VFs); 5669 5670 // Select the largest VF which doesn't require more registers than existing 5671 // ones. 
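// The candidate VFs were pushed in increasing order above, so walking the
// register-usage results backwards visits the widest VF first; the first
// candidate whose per-class usage fits the target's register count is taken.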
5672 for (int i = RUs.size() - 1; i >= 0; --i) { 5673 bool Selected = true; 5674 for (auto& pair : RUs[i].MaxLocalUsers) { 5675 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5676 if (pair.second > TargetNumRegisters) 5677 Selected = false; 5678 } 5679 if (Selected) { 5680 MaxVF = VFs[i].getKnownMinValue(); 5681 break; 5682 } 5683 } 5684 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5685 if (MaxVF < MinVF) { 5686 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5687 << ") with target's minimum: " << MinVF << '\n'); 5688 MaxVF = MinVF; 5689 } 5690 } 5691 } 5692 return ElementCount::getFixed(MaxVF); 5693 } 5694 5695 VectorizationFactor 5696 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5697 // FIXME: This can be fixed for scalable vectors later, because at this stage 5698 // the LoopVectorizer will only consider vectorizing a loop with scalable 5699 // vectors when the loop has a hint to enable vectorization for a given VF. 5700 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5701 5702 float Cost = expectedCost(ElementCount::getFixed(1)).first; 5703 const float ScalarCost = Cost; 5704 unsigned Width = 1; 5705 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5706 5707 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5708 if (ForceVectorization && MaxVF.isVector()) { 5709 // Ignore scalar width, because the user explicitly wants vectorization. 5710 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5711 // evaluation. 5712 Cost = std::numeric_limits<float>::max(); 5713 } 5714 5715 for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { 5716 // Notice that the vector loop needs to be executed less times, so 5717 // we need to divide the cost of the vector loops by the width of 5718 // the vector elements. 5719 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5720 float VectorCost = C.first / (float)i; 5721 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5722 << " costs: " << (int)VectorCost << ".\n"); 5723 if (!C.second && !ForceVectorization) { 5724 LLVM_DEBUG( 5725 dbgs() << "LV: Not considering vector loop of width " << i 5726 << " because it will not generate any vector instructions.\n"); 5727 continue; 5728 } 5729 5730 // If profitable add it to ProfitableVF list. 5731 if (VectorCost < ScalarCost) { 5732 ProfitableVFs.push_back(VectorizationFactor( 5733 {ElementCount::getFixed(i), (unsigned)VectorCost})); 5734 } 5735 5736 if (VectorCost < Cost) { 5737 Cost = VectorCost; 5738 Width = i; 5739 } 5740 } 5741 5742 if (!EnableCondStoresVectorization && NumPredStores) { 5743 reportVectorizationFailure("There are conditional stores.", 5744 "store that is conditionally executed prevents vectorization", 5745 "ConditionalStore", ORE, TheLoop); 5746 Width = 1; 5747 Cost = ScalarCost; 5748 } 5749 5750 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5751 << "LV: Vectorization seems to be not beneficial, " 5752 << "but was forced by a user.\n"); 5753 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5754 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5755 (unsigned)(Width * Cost)}; 5756 return Factor; 5757 } 5758 5759 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5760 const Loop &L, ElementCount VF) const { 5761 // Cross iteration phis such as reductions need special handling and are 5762 // currently unsupported. 
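// Illustrative examples (not from the original source): a reduction phi such
// as 'sum = phi [0, preheader], [sum.next, latch]', or a first-order
// recurrence that reads the previous iteration's value, carries state across
// iterations and therefore disqualifies the loop below.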
5763 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5764 return Legal->isFirstOrderRecurrence(&Phi) || 5765 Legal->isReductionVariable(&Phi); 5766 })) 5767 return false; 5768 5769 // Phis with uses outside of the loop require special handling and are 5770 // currently unsupported. 5771 for (auto &Entry : Legal->getInductionVars()) { 5772 // Look for uses of the value of the induction at the last iteration. 5773 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5774 for (User *U : PostInc->users()) 5775 if (!L.contains(cast<Instruction>(U))) 5776 return false; 5777 // Look for uses of penultimate value of the induction. 5778 for (User *U : Entry.first->users()) 5779 if (!L.contains(cast<Instruction>(U))) 5780 return false; 5781 } 5782 5783 // Induction variables that are widened require special handling that is 5784 // currently not supported. 5785 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5786 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5787 this->isProfitableToScalarize(Entry.first, VF)); 5788 })) 5789 return false; 5790 5791 return true; 5792 } 5793 5794 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5795 const ElementCount VF) const { 5796 // FIXME: We need a much better cost-model to take different parameters such 5797 // as register pressure, code size increase and cost of extra branches into 5798 // account. For now we apply a very crude heuristic and only consider loops 5799 // with vectorization factors larger than a certain value. 5800 // We also consider epilogue vectorization unprofitable for targets that don't 5801 // consider interleaving beneficial (eg. MVE). 5802 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5803 return false; 5804 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5805 return true; 5806 return false; 5807 } 5808 5809 VectorizationFactor 5810 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5811 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5812 VectorizationFactor Result = VectorizationFactor::Disabled(); 5813 if (!EnableEpilogueVectorization) { 5814 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5815 return Result; 5816 } 5817 5818 if (!isScalarEpilogueAllowed()) { 5819 LLVM_DEBUG( 5820 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5821 "allowed.\n";); 5822 return Result; 5823 } 5824 5825 // FIXME: This can be fixed for scalable vectors later, because at this stage 5826 // the LoopVectorizer will only consider vectorizing a loop with scalable 5827 // vectors when the loop has a hint to enable vectorization for a given VF. 5828 if (MainLoopVF.isScalable()) { 5829 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5830 "yet supported.\n"); 5831 return Result; 5832 } 5833 5834 // Not really a cost consideration, but check for unsupported cases here to 5835 // simplify the logic. 
5836 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5837 LLVM_DEBUG( 5838 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5839 "not a supported candidate.\n";); 5840 return Result; 5841 } 5842 5843 if (EpilogueVectorizationForceVF > 1) { 5844 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5845 if (LVP.hasPlanWithVFs( 5846 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5847 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5848 else { 5849 LLVM_DEBUG( 5850 dbgs() 5851 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5852 return Result; 5853 } 5854 } 5855 5856 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5857 TheLoop->getHeader()->getParent()->hasMinSize()) { 5858 LLVM_DEBUG( 5859 dbgs() 5860 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5861 return Result; 5862 } 5863 5864 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 5865 return Result; 5866 5867 for (auto &NextVF : ProfitableVFs) 5868 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 5869 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 5870 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 5871 Result = NextVF; 5872 5873 if (Result != VectorizationFactor::Disabled()) 5874 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5875 << Result.Width.getFixedValue() << "\n";); 5876 return Result; 5877 } 5878 5879 std::pair<unsigned, unsigned> 5880 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5881 unsigned MinWidth = -1U; 5882 unsigned MaxWidth = 8; 5883 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5884 5885 // For each block. 5886 for (BasicBlock *BB : TheLoop->blocks()) { 5887 // For each instruction in the loop. 5888 for (Instruction &I : BB->instructionsWithoutDebug()) { 5889 Type *T = I.getType(); 5890 5891 // Skip ignored values. 5892 if (ValuesToIgnore.count(&I)) 5893 continue; 5894 5895 // Only examine Loads, Stores and PHINodes. 5896 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5897 continue; 5898 5899 // Examine PHI nodes that are reduction variables. Update the type to 5900 // account for the recurrence type. 5901 if (auto *PN = dyn_cast<PHINode>(&I)) { 5902 if (!Legal->isReductionVariable(PN)) 5903 continue; 5904 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5905 T = RdxDesc.getRecurrenceType(); 5906 } 5907 5908 // Examine the stored values. 5909 if (auto *ST = dyn_cast<StoreInst>(&I)) 5910 T = ST->getValueOperand()->getType(); 5911 5912 // Ignore loaded pointer types and stored pointer types that are not 5913 // vectorizable. 5914 // 5915 // FIXME: The check here attempts to predict whether a load or store will 5916 // be vectorized. We only know this for certain after a VF has 5917 // been selected. Here, we assume that if an access can be 5918 // vectorized, it will be. We should also look at extending this 5919 // optimization to non-pointer types. 
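// For illustration (not part of the original comment): a loop that loads i8
// elements and accumulates them into an i32 reduction would report
// {SmallestType, WidestType} = {8, 32} from this analysis.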
5920 //
5921 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5922 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5923 continue;
5924
5925 MinWidth = std::min(MinWidth,
5926 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5927 MaxWidth = std::max(MaxWidth,
5928 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5929 }
5930 }
5931
5932 return {MinWidth, MaxWidth};
5933 }
5934
5935 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5936 unsigned LoopCost) {
5937 // -- The interleave heuristics --
5938 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5939 // There are many micro-architectural considerations that we can't predict
5940 // at this level. For example, frontend pressure (on decode or fetch) due to
5941 // code size, or the number and capabilities of the execution ports.
5942 //
5943 // We use the following heuristics to select the interleave count:
5944 // 1. If the code has reductions, then we interleave to break the cross
5945 // iteration dependency.
5946 // 2. If the loop is really small, then we interleave to reduce the loop
5947 // overhead.
5948 // 3. We don't interleave if we think that we will spill registers to memory
5949 // due to the increased register pressure.
5950
5951 if (!isScalarEpilogueAllowed())
5952 return 1;
5953
5954 // The max safe dependence distance already limits the VF; do not interleave further.
5955 if (Legal->getMaxSafeDepDistBytes() != -1U)
5956 return 1;
5957
5958 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5959 const bool HasReductions = !Legal->getReductionVars().empty();
5960 // Do not interleave loops with a relatively small known or estimated trip
5961 // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5962 // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5963 // because under those conditions interleaving can expose ILP and break
5964 // cross-iteration dependences for reductions.
5965 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5966 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5967 return 1;
5968
5969 RegisterUsage R = calculateRegisterUsage({VF})[0];
5970 // We divide by these counts below, so assume that we have at least one
5971 // instruction that uses at least one register.
5972 for (auto& pair : R.MaxLocalUsers) {
5973 pair.second = std::max(pair.second, 1U);
5974 }
5975
5976 // We calculate the interleave count using the following formula.
5977 // Subtract the number of loop invariants from the number of available
5978 // registers. These registers are used by all of the interleaved instances.
5979 // Next, divide the remaining registers by the number of registers that is
5980 // required by the loop, in order to estimate how many parallel instances
5981 // fit without causing spills. All of this is rounded down if necessary to be
5982 // a power of two. We want a power-of-two interleave count to simplify any
5983 // addressing operations or alignment considerations.
5984 // We also want power-of-two interleave counts to ensure that the induction
5985 // variable of the vector loop wraps to zero when the tail is folded by masking;
5986 // this currently happens when OptForSize, in which case IC is set to 1 above.
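// Worked example with hypothetical numbers: given 32 available registers in
// a class, 2 of them held by loop-invariant values and at most 6 values of
// that class live at once, the computation below yields
//   IC = PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4
// before the trip-count and target clamps further below are applied.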
5987 unsigned IC = UINT_MAX; 5988 5989 for (auto& pair : R.MaxLocalUsers) { 5990 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5991 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5992 << " registers of " 5993 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5994 if (VF.isScalar()) { 5995 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5996 TargetNumRegisters = ForceTargetNumScalarRegs; 5997 } else { 5998 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5999 TargetNumRegisters = ForceTargetNumVectorRegs; 6000 } 6001 unsigned MaxLocalUsers = pair.second; 6002 unsigned LoopInvariantRegs = 0; 6003 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6004 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6005 6006 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6007 // Don't count the induction variable as interleaved. 6008 if (EnableIndVarRegisterHeur) { 6009 TmpIC = 6010 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6011 std::max(1U, (MaxLocalUsers - 1))); 6012 } 6013 6014 IC = std::min(IC, TmpIC); 6015 } 6016 6017 // Clamp the interleave ranges to reasonable counts. 6018 unsigned MaxInterleaveCount = 6019 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6020 6021 // Check if the user has overridden the max. 6022 if (VF.isScalar()) { 6023 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6024 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6025 } else { 6026 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6027 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6028 } 6029 6030 // If trip count is known or estimated compile time constant, limit the 6031 // interleave count to be less than the trip count divided by VF, provided it 6032 // is at least 1. 6033 // 6034 // For scalable vectors we can't know if interleaving is beneficial. It may 6035 // not be beneficial for small loops if none of the lanes in the second vector 6036 // iterations is enabled. However, for larger loops, there is likely to be a 6037 // similar benefit as for fixed-width vectors. For now, we choose to leave 6038 // the InterleaveCount as if vscale is '1', although if some information about 6039 // the vector is known (e.g. min vector size), we can make a better decision. 6040 if (BestKnownTC) { 6041 MaxInterleaveCount = 6042 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6043 // Make sure MaxInterleaveCount is greater than 0. 6044 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6045 } 6046 6047 assert(MaxInterleaveCount > 0 && 6048 "Maximum interleave count must be greater than 0"); 6049 6050 // Clamp the calculated IC to be between the 1 and the max interleave count 6051 // that the target and trip count allows. 6052 if (IC > MaxInterleaveCount) 6053 IC = MaxInterleaveCount; 6054 else 6055 // Make sure IC is greater than 0. 6056 IC = std::max(1u, IC); 6057 6058 assert(IC > 0 && "Interleave count must be greater than 0."); 6059 6060 // If we did not calculate the cost for VF (because the user selected the VF) 6061 // then we calculate the cost of VF here. 6062 if (LoopCost == 0) 6063 LoopCost = expectedCost(VF).first; 6064 6065 assert(LoopCost && "Non-zero loop cost expected"); 6066 6067 // Interleave if we vectorized this loop and there is a reduction that could 6068 // benefit from interleaving. 
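// Illustrative rationale (an addition, not from the original source):
// interleaving a vectorized reduction keeps several independent partial
// accumulators in flight instead of one serial chain, hiding the latency of
// the cross-iteration dependence; the partial results are combined after the
// loop.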
6069 if (VF.isVector() && HasReductions) { 6070 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6071 return IC; 6072 } 6073 6074 // Note that if we've already vectorized the loop we will have done the 6075 // runtime check and so interleaving won't require further checks. 6076 bool InterleavingRequiresRuntimePointerCheck = 6077 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6078 6079 // We want to interleave small loops in order to reduce the loop overhead and 6080 // potentially expose ILP opportunities. 6081 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6082 << "LV: IC is " << IC << '\n' 6083 << "LV: VF is " << VF << '\n'); 6084 const bool AggressivelyInterleaveReductions = 6085 TTI.enableAggressiveInterleaving(HasReductions); 6086 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6087 // We assume that the cost overhead is 1 and we use the cost model 6088 // to estimate the cost of the loop and interleave until the cost of the 6089 // loop overhead is about 5% of the cost of the loop. 6090 unsigned SmallIC = 6091 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6092 6093 // Interleave until store/load ports (estimated by max interleave count) are 6094 // saturated. 6095 unsigned NumStores = Legal->getNumStores(); 6096 unsigned NumLoads = Legal->getNumLoads(); 6097 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6098 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6099 6100 // If we have a scalar reduction (vector reductions are already dealt with 6101 // by this point), we can increase the critical path length if the loop 6102 // we're interleaving is inside another loop. Limit, by default to 2, so the 6103 // critical path only gets increased by one reduction operation. 6104 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6105 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6106 SmallIC = std::min(SmallIC, F); 6107 StoresIC = std::min(StoresIC, F); 6108 LoadsIC = std::min(LoadsIC, F); 6109 } 6110 6111 if (EnableLoadStoreRuntimeInterleave && 6112 std::max(StoresIC, LoadsIC) > SmallIC) { 6113 LLVM_DEBUG( 6114 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6115 return std::max(StoresIC, LoadsIC); 6116 } 6117 6118 // If there are scalar reductions and TTI has enabled aggressive 6119 // interleaving for reductions, we will interleave to expose ILP. 6120 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6121 AggressivelyInterleaveReductions) { 6122 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6123 // Interleave no less than SmallIC but not as aggressive as the normal IC 6124 // to satisfy the rare situation when resources are too limited. 6125 return std::max(IC / 2, SmallIC); 6126 } else { 6127 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6128 return SmallIC; 6129 } 6130 } 6131 6132 // Interleave if this is a large loop (small loops are already dealt with by 6133 // this point) that could benefit from interleaving. 6134 if (AggressivelyInterleaveReductions) { 6135 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6136 return IC; 6137 } 6138 6139 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6140 return 1; 6141 } 6142 6143 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6144 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6145 // This function calculates the register usage by measuring the highest number 6146 // of values that are alive at a single location. 
Obviously, this is a very
6147 // rough estimation. We scan the loop in topological order and assign a
6148 // number to each instruction. We use RPO to ensure that defs are
6149 // met before their users. We assume that each instruction that has in-loop
6150 // users starts an interval. We record every time that an in-loop value is
6151 // used, so we have a list of the first and last occurrences of each
6152 // instruction. Next, we transpose this data structure into a multi-map that
6153 // holds the list of intervals that *end* at a specific location. This
6154 // multi-map allows us to process the intervals in a single linear scan. We scan the instructions linearly
6155 // and record each time that a new interval starts, by placing it in a set.
6156 // If we find this value in the multi-map then we remove it from the set.
6157 // The max register usage is the maximum size of the set.
6158 // We also search for instructions that are defined outside the loop, but are
6159 // used inside the loop. We need this number separately from the max-interval
6160 // usage number because, when we unroll, loop-invariant values do not consume
6161 // more registers.
6162 LoopBlocksDFS DFS(TheLoop);
6163 DFS.perform(LI);
6164
6165 RegisterUsage RU;
6166
6167 // Each 'key' in the map opens a new interval. The values
6168 // of the map are the index of the 'last seen' usage of the
6169 // instruction that is the key.
6170 using IntervalMap = DenseMap<Instruction *, unsigned>;
6171
6172 // Maps instruction to its index.
6173 SmallVector<Instruction *, 64> IdxToInstr;
6174 // Marks the end of each interval.
6175 IntervalMap EndPoint;
6176 // Saves the list of instructions that are used in the loop.
6177 SmallPtrSet<Instruction *, 8> Ends;
6178 // Saves the list of values that are used in the loop but are
6179 // defined outside the loop, such as arguments and constants.
6180 SmallPtrSet<Value *, 8> LoopInvariants;
6181
6182 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6183 for (Instruction &I : BB->instructionsWithoutDebug()) {
6184 IdxToInstr.push_back(&I);
6185
6186 // Save the end location of each USE.
6187 for (Value *U : I.operands()) {
6188 auto *Instr = dyn_cast<Instruction>(U);
6189
6190 // Ignore non-instruction values such as arguments, constants, etc.
6191 if (!Instr)
6192 continue;
6193
6194 // If this instruction is outside the loop then record it and continue.
6195 if (!TheLoop->contains(Instr)) {
6196 LoopInvariants.insert(Instr);
6197 continue;
6198 }
6199
6200 // Overwrite previous end points.
6201 EndPoint[Instr] = IdxToInstr.size();
6202 Ends.insert(Instr);
6203 }
6204 }
6205 }
6206
6207 // Saves the list of intervals that end with the index in 'key'.
6208 using InstrList = SmallVector<Instruction *, 2>;
6209 DenseMap<unsigned, InstrList> TransposeEnds;
6210
6211 // Transpose the EndPoints to a list of values that end at each index.
6212 for (auto &Interval : EndPoint)
6213 TransposeEnds[Interval.second].push_back(Interval.first);
6214
6215 SmallPtrSet<Instruction *, 8> OpenIntervals;
6216 SmallVector<RegisterUsage, 8> RUs(VFs.size());
6217 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6218
6219 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6220
6221 // A lambda that gets the register usage for the given type and VF.
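// For example (hypothetical target): with 128-bit vector registers, asking
// GetRegUsage for an i32 at VF = 8 queries the usage of <8 x i32> (256 bits),
// which would typically be reported as 2 registers; token and other invalid
// element types are reported as 0 below.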
6222 const auto &TTICapture = TTI; 6223 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6224 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6225 return 0U; 6226 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6227 }; 6228 6229 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6230 Instruction *I = IdxToInstr[i]; 6231 6232 // Remove all of the instructions that end at this location. 6233 InstrList &List = TransposeEnds[i]; 6234 for (Instruction *ToRemove : List) 6235 OpenIntervals.erase(ToRemove); 6236 6237 // Ignore instructions that are never used within the loop. 6238 if (!Ends.count(I)) 6239 continue; 6240 6241 // Skip ignored values. 6242 if (ValuesToIgnore.count(I)) 6243 continue; 6244 6245 // For each VF find the maximum usage of registers. 6246 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6247 // Count the number of live intervals. 6248 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6249 6250 if (VFs[j].isScalar()) { 6251 for (auto Inst : OpenIntervals) { 6252 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6253 if (RegUsage.find(ClassID) == RegUsage.end()) 6254 RegUsage[ClassID] = 1; 6255 else 6256 RegUsage[ClassID] += 1; 6257 } 6258 } else { 6259 collectUniformsAndScalars(VFs[j]); 6260 for (auto Inst : OpenIntervals) { 6261 // Skip ignored values for VF > 1. 6262 if (VecValuesToIgnore.count(Inst)) 6263 continue; 6264 if (isScalarAfterVectorization(Inst, VFs[j])) { 6265 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6266 if (RegUsage.find(ClassID) == RegUsage.end()) 6267 RegUsage[ClassID] = 1; 6268 else 6269 RegUsage[ClassID] += 1; 6270 } else { 6271 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6272 if (RegUsage.find(ClassID) == RegUsage.end()) 6273 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6274 else 6275 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6276 } 6277 } 6278 } 6279 6280 for (auto& pair : RegUsage) { 6281 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6282 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6283 else 6284 MaxUsages[j][pair.first] = pair.second; 6285 } 6286 } 6287 6288 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6289 << OpenIntervals.size() << '\n'); 6290 6291 // Add the current instruction to the list of open intervals. 6292 OpenIntervals.insert(I); 6293 } 6294 6295 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6296 SmallMapVector<unsigned, unsigned, 4> Invariant; 6297 6298 for (auto Inst : LoopInvariants) { 6299 unsigned Usage = 6300 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]);
6301 unsigned ClassID =
6302 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6303 if (Invariant.find(ClassID) == Invariant.end())
6304 Invariant[ClassID] = Usage;
6305 else
6306 Invariant[ClassID] += Usage;
6307 }
6308
6309 LLVM_DEBUG({
6310 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6311 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6312 << " item\n";
6313 for (const auto &pair : MaxUsages[i]) {
6314 dbgs() << "LV(REG): RegisterClass: "
6315 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6316 << " registers\n";
6317 }
6318 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6319 << " item\n";
6320 for (const auto &pair : Invariant) {
6321 dbgs() << "LV(REG): RegisterClass: "
6322 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6323 << " registers\n";
6324 }
6325 });
6326
6327 RU.LoopInvariantRegs = Invariant;
6328 RU.MaxLocalUsers = MaxUsages[i];
6329 RUs[i] = RU;
6330 }
6331
6332 return RUs;
6333 }
6334
6335 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6336 // TODO: Cost model for emulated masked load/store is completely
6337 // broken. This hack guides the cost model to use an artificially
6338 // high enough value to practically disable vectorization with such
6339 // operations, except where the previously deployed legality hack allowed
6340 // using very low cost values. This is to avoid regressions coming simply
6341 // from moving the "masked load/store" check from legality to the cost model.
6342 // Masked load/gather emulation was previously never allowed.
6343 // Emulation of a limited number of masked stores/scatters was allowed.
6344 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6345 return isa<LoadInst>(I) ||
6346 (isa<StoreInst>(I) &&
6347 NumPredStores > NumberOfStoresToPredicate);
6348 }
6349
6350 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6351 // If we aren't vectorizing the loop, or if we've already collected the
6352 // instructions to scalarize, there's nothing to do. Collection may already
6353 // have occurred if we have a user-selected VF and are now computing the
6354 // expected cost for interleaving.
6355 if (VF.isScalar() || VF.isZero() ||
6356 InstsToScalarize.find(VF) != InstsToScalarize.end())
6357 return;
6358
6359 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6360 // not profitable to scalarize any instructions, the presence of VF in the
6361 // map will indicate that we've analyzed it already.
6362 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6363
6364 // Find all the instructions that are scalar with predication in the loop and
6365 // determine if it would be better not to if-convert the blocks they are in.
6366 // If so, we also record the instructions to scalarize.
6367 for (BasicBlock *BB : TheLoop->blocks()) {
6368 if (!blockNeedsPredication(BB))
6369 continue;
6370 for (Instruction &I : *BB)
6371 if (isScalarWithPredication(&I)) {
6372 ScalarCostsTy ScalarCosts;
6373 // Do not apply discount logic if hacked cost is needed
6374 // for emulated masked memrefs.
6375 if (!useEmulatedMaskMemRefHack(&I) &&
6376 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6377 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6378 // Remember that BB will remain after vectorization.
6379 PredicatedBBsAfterVectorization.insert(BB); 6380 } 6381 } 6382 } 6383 6384 int LoopVectorizationCostModel::computePredInstDiscount( 6385 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 6386 ElementCount VF) { 6387 assert(!isUniformAfterVectorization(PredInst, VF) && 6388 "Instruction marked uniform-after-vectorization will be predicated"); 6389 6390 // Initialize the discount to zero, meaning that the scalar version and the 6391 // vector version cost the same. 6392 int Discount = 0; 6393 6394 // Holds instructions to analyze. The instructions we visit are mapped in 6395 // ScalarCosts. Those instructions are the ones that would be scalarized if 6396 // we find that the scalar version costs less. 6397 SmallVector<Instruction *, 8> Worklist; 6398 6399 // Returns true if the given instruction can be scalarized. 6400 auto canBeScalarized = [&](Instruction *I) -> bool { 6401 // We only attempt to scalarize instructions forming a single-use chain 6402 // from the original predicated block that would otherwise be vectorized. 6403 // Although not strictly necessary, we give up on instructions we know will 6404 // already be scalar to avoid traversing chains that are unlikely to be 6405 // beneficial. 6406 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6407 isScalarAfterVectorization(I, VF)) 6408 return false; 6409 6410 // If the instruction is scalar with predication, it will be analyzed 6411 // separately. We ignore it within the context of PredInst. 6412 if (isScalarWithPredication(I)) 6413 return false; 6414 6415 // If any of the instruction's operands are uniform after vectorization, 6416 // the instruction cannot be scalarized. This prevents, for example, a 6417 // masked load from being scalarized. 6418 // 6419 // We assume we will only emit a value for lane zero of an instruction 6420 // marked uniform after vectorization, rather than VF identical values. 6421 // Thus, if we scalarize an instruction that uses a uniform, we would 6422 // create uses of values corresponding to the lanes we aren't emitting code 6423 // for. This behavior can be changed by allowing getScalarValue to clone 6424 // the lane zero values for uniforms rather than asserting. 6425 for (Use &U : I->operands()) 6426 if (auto *J = dyn_cast<Instruction>(U.get())) 6427 if (isUniformAfterVectorization(J, VF)) 6428 return false; 6429 6430 // Otherwise, we can scalarize the instruction. 6431 return true; 6432 }; 6433 6434 // Compute the expected cost discount from scalarizing the entire expression 6435 // feeding the predicated instruction. We currently only consider expressions 6436 // that are single-use instruction chains. 6437 Worklist.push_back(PredInst); 6438 while (!Worklist.empty()) { 6439 Instruction *I = Worklist.pop_back_val(); 6440 6441 // If we've already analyzed the instruction, there's nothing to do. 6442 if (ScalarCosts.find(I) != ScalarCosts.end()) 6443 continue; 6444 6445 // Compute the cost of the vector instruction. Note that this cost already 6446 // includes the scalarization overhead of the predicated instruction. 6447 unsigned VectorCost = getInstructionCost(I, VF).first; 6448 6449 // Compute the cost of the scalarized instruction. This cost is the cost of 6450 // the instruction as if it wasn't if-converted and instead remained in the 6451 // predicated block. We will scale this cost by block probability after 6452 // computing the scalarization overhead. 
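// Rough example with hypothetical costs: for VF = 4 and a scalar instruction
// cost of 1, the scalarized estimate starts at 4 * 1 = 4, gains the
// insert/extract and phi overhead added below, and is then divided by
// getReciprocalPredBlockProb() (assumed here to be 2, i.e. the predicated
// block is expected to execute about half the time).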
6453 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6454 unsigned ScalarCost = 6455 VF.getKnownMinValue() * 6456 getInstructionCost(I, ElementCount::getFixed(1)).first; 6457 6458 // Compute the scalarization overhead of needed insertelement instructions 6459 // and phi nodes. 6460 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6461 ScalarCost += TTI.getScalarizationOverhead( 6462 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6463 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6464 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6465 ScalarCost += 6466 VF.getKnownMinValue() * 6467 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6468 } 6469 6470 // Compute the scalarization overhead of needed extractelement 6471 // instructions. For each of the instruction's operands, if the operand can 6472 // be scalarized, add it to the worklist; otherwise, account for the 6473 // overhead. 6474 for (Use &U : I->operands()) 6475 if (auto *J = dyn_cast<Instruction>(U.get())) { 6476 assert(VectorType::isValidElementType(J->getType()) && 6477 "Instruction has non-scalar type"); 6478 if (canBeScalarized(J)) 6479 Worklist.push_back(J); 6480 else if (needsExtract(J, VF)) { 6481 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6482 ScalarCost += TTI.getScalarizationOverhead( 6483 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6484 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6485 } 6486 } 6487 6488 // Scale the total scalar cost by block probability. 6489 ScalarCost /= getReciprocalPredBlockProb(); 6490 6491 // Compute the discount. A non-negative discount means the vector version 6492 // of the instruction costs more, and scalarizing would be beneficial. 6493 Discount += VectorCost - ScalarCost; 6494 ScalarCosts[I] = ScalarCost; 6495 } 6496 6497 return Discount; 6498 } 6499 6500 LoopVectorizationCostModel::VectorizationCostTy 6501 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6502 VectorizationCostTy Cost; 6503 6504 // For each block. 6505 for (BasicBlock *BB : TheLoop->blocks()) { 6506 VectorizationCostTy BlockCost; 6507 6508 // For each instruction in the old loop. 6509 for (Instruction &I : BB->instructionsWithoutDebug()) { 6510 // Skip ignored values. 6511 if (ValuesToIgnore.count(&I) || 6512 (VF.isVector() && VecValuesToIgnore.count(&I))) 6513 continue; 6514 6515 VectorizationCostTy C = getInstructionCost(&I, VF); 6516 6517 // Check if we should override the cost. 6518 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6519 C.first = ForceTargetInstructionCost; 6520 6521 BlockCost.first += C.first; 6522 BlockCost.second |= C.second; 6523 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6524 << " for VF " << VF << " For instruction: " << I 6525 << '\n'); 6526 } 6527 6528 // If we are vectorizing a predicated block, it will have been 6529 // if-converted. This means that the block's instructions (aside from 6530 // stores and instructions that may divide by zero) will now be 6531 // unconditionally executed. For the scalar case, we may not always execute 6532 // the predicated block, if it is an if-else block. Thus, scale the block's 6533 // cost by the probability of executing it. blockNeedsPredication from 6534 // Legal is used so as to not include all blocks in tail folded loops. 
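// For example (hypothetical numbers): a predicated block whose instructions
// sum to a cost of 10 contributes only 10 / 2 = 5 to the scalar loop cost,
// assuming the reciprocal predicated-block probability is 2 (the block is
// expected to run roughly half the time).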
6535 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6536 BlockCost.first /= getReciprocalPredBlockProb(); 6537 6538 Cost.first += BlockCost.first; 6539 Cost.second |= BlockCost.second; 6540 } 6541 6542 return Cost; 6543 } 6544 6545 /// Gets Address Access SCEV after verifying that the access pattern 6546 /// is loop invariant except the induction variable dependence. 6547 /// 6548 /// This SCEV can be sent to the Target in order to estimate the address 6549 /// calculation cost. 6550 static const SCEV *getAddressAccessSCEV( 6551 Value *Ptr, 6552 LoopVectorizationLegality *Legal, 6553 PredicatedScalarEvolution &PSE, 6554 const Loop *TheLoop) { 6555 6556 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6557 if (!Gep) 6558 return nullptr; 6559 6560 // We are looking for a gep with all loop invariant indices except for one 6561 // which should be an induction variable. 6562 auto SE = PSE.getSE(); 6563 unsigned NumOperands = Gep->getNumOperands(); 6564 for (unsigned i = 1; i < NumOperands; ++i) { 6565 Value *Opd = Gep->getOperand(i); 6566 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6567 !Legal->isInductionVariable(Opd)) 6568 return nullptr; 6569 } 6570 6571 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6572 return PSE.getSCEV(Ptr); 6573 } 6574 6575 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6576 return Legal->hasStride(I->getOperand(0)) || 6577 Legal->hasStride(I->getOperand(1)); 6578 } 6579 6580 unsigned 6581 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6582 ElementCount VF) { 6583 assert(VF.isVector() && 6584 "Scalarization cost of instruction implies vectorization."); 6585 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6586 Type *ValTy = getMemInstValueType(I); 6587 auto SE = PSE.getSE(); 6588 6589 unsigned AS = getLoadStoreAddressSpace(I); 6590 Value *Ptr = getLoadStorePointerOperand(I); 6591 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6592 6593 // Figure out whether the access is strided and get the stride value 6594 // if it's known in compile time 6595 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6596 6597 // Get the cost of the scalar memory instruction and address computation. 6598 unsigned Cost = 6599 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6600 6601 // Don't pass *I here, since it is scalar but will actually be part of a 6602 // vectorized loop where the user of it is a vectorized instruction. 6603 const Align Alignment = getLoadStoreAlignment(I); 6604 Cost += VF.getKnownMinValue() * 6605 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6606 AS, TTI::TCK_RecipThroughput); 6607 6608 // Get the overhead of the extractelement and insertelement instructions 6609 // we might create due to scalarization. 6610 Cost += getScalarizationOverhead(I, VF); 6611 6612 // If we have a predicated store, it may not be executed for each vector 6613 // lane. Scale the cost by the probability of executing the predicated 6614 // block. 6615 if (isPredicatedInst(I)) { 6616 Cost /= getReciprocalPredBlockProb(); 6617 6618 if (useEmulatedMaskMemRefHack(I)) 6619 // Artificially setting to a high enough value to practically disable 6620 // vectorization with such operations. 
6621 Cost = 3000000; 6622 } 6623 6624 return Cost; 6625 } 6626 6627 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6628 ElementCount VF) { 6629 Type *ValTy = getMemInstValueType(I); 6630 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6631 Value *Ptr = getLoadStorePointerOperand(I); 6632 unsigned AS = getLoadStoreAddressSpace(I); 6633 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6634 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6635 6636 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6637 "Stride should be 1 or -1 for consecutive memory access"); 6638 const Align Alignment = getLoadStoreAlignment(I); 6639 unsigned Cost = 0; 6640 if (Legal->isMaskRequired(I)) 6641 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6642 CostKind); 6643 else 6644 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6645 CostKind, I); 6646 6647 bool Reverse = ConsecutiveStride < 0; 6648 if (Reverse) 6649 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6650 return Cost; 6651 } 6652 6653 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6654 ElementCount VF) { 6655 assert(Legal->isUniformMemOp(*I)); 6656 6657 Type *ValTy = getMemInstValueType(I); 6658 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6659 const Align Alignment = getLoadStoreAlignment(I); 6660 unsigned AS = getLoadStoreAddressSpace(I); 6661 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6662 if (isa<LoadInst>(I)) { 6663 return TTI.getAddressComputationCost(ValTy) + 6664 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6665 CostKind) + 6666 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6667 } 6668 StoreInst *SI = cast<StoreInst>(I); 6669 6670 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6671 return TTI.getAddressComputationCost(ValTy) + 6672 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6673 CostKind) + 6674 (isLoopInvariantStoreValue 6675 ? 0 6676 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6677 VF.getKnownMinValue() - 1)); 6678 } 6679 6680 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6681 ElementCount VF) { 6682 Type *ValTy = getMemInstValueType(I); 6683 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6684 const Align Alignment = getLoadStoreAlignment(I); 6685 const Value *Ptr = getLoadStorePointerOperand(I); 6686 6687 return TTI.getAddressComputationCost(VectorTy) + 6688 TTI.getGatherScatterOpCost( 6689 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6690 TargetTransformInfo::TCK_RecipThroughput, I); 6691 } 6692 6693 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6694 ElementCount VF) { 6695 Type *ValTy = getMemInstValueType(I); 6696 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6697 unsigned AS = getLoadStoreAddressSpace(I); 6698 6699 auto Group = getInterleavedAccessGroup(I); 6700 assert(Group && "Fail to get an interleaved access group."); 6701 6702 unsigned InterleaveFactor = Group->getFactor(); 6703 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6704 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6705 6706 // Holds the indices of existing members in an interleaved load group. 6707 // An interleaved store group doesn't need this as it doesn't allow gaps. 
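// Illustrative example (not from the original source): for a load group with
// factor 3 whose member at index 1 is missing (a gap), the loop below
// collects Indices = {0, 2}.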
6708 SmallVector<unsigned, 4> Indices; 6709 if (isa<LoadInst>(I)) { 6710 for (unsigned i = 0; i < InterleaveFactor; i++) 6711 if (Group->getMember(i)) 6712 Indices.push_back(i); 6713 } 6714 6715 // Calculate the cost of the whole interleaved group. 6716 bool UseMaskForGaps = 6717 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6718 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6719 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6720 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6721 6722 if (Group->isReverse()) { 6723 // TODO: Add support for reversed masked interleaved access. 6724 assert(!Legal->isMaskRequired(I) && 6725 "Reverse masked interleaved access not supported."); 6726 Cost += Group->getNumMembers() * 6727 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6728 } 6729 return Cost; 6730 } 6731 6732 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6733 ElementCount VF) { 6734 // Calculate scalar cost only. Vectorization cost should be ready at this 6735 // moment. 6736 if (VF.isScalar()) { 6737 Type *ValTy = getMemInstValueType(I); 6738 const Align Alignment = getLoadStoreAlignment(I); 6739 unsigned AS = getLoadStoreAddressSpace(I); 6740 6741 return TTI.getAddressComputationCost(ValTy) + 6742 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6743 TTI::TCK_RecipThroughput, I); 6744 } 6745 return getWideningCost(I, VF); 6746 } 6747 6748 LoopVectorizationCostModel::VectorizationCostTy 6749 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6750 ElementCount VF) { 6751 // If we know that this instruction will remain uniform, check the cost of 6752 // the scalar version. 6753 if (isUniformAfterVectorization(I, VF)) 6754 VF = ElementCount::getFixed(1); 6755 6756 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6757 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6758 6759 // Forced scalars do not have any scalarization overhead. 6760 auto ForcedScalar = ForcedScalars.find(VF); 6761 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6762 auto InstSet = ForcedScalar->second; 6763 if (InstSet.count(I)) 6764 return VectorizationCostTy( 6765 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6766 VF.getKnownMinValue()), 6767 false); 6768 } 6769 6770 Type *VectorTy; 6771 unsigned C = getInstructionCost(I, VF, VectorTy); 6772 6773 bool TypeNotScalarized = 6774 VF.isVector() && VectorTy->isVectorTy() && 6775 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6776 return VectorizationCostTy(C, TypeNotScalarized); 6777 } 6778 6779 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6780 ElementCount VF) { 6781 6782 assert(!VF.isScalable() && 6783 "cannot compute scalarization overhead for scalable vectorization"); 6784 if (VF.isScalar()) 6785 return 0; 6786 6787 unsigned Cost = 0; 6788 Type *RetTy = ToVectorTy(I->getType(), VF); 6789 if (!RetTy->isVoidTy() && 6790 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6791 Cost += TTI.getScalarizationOverhead( 6792 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 6793 true, false); 6794 6795 // Some targets keep addresses scalar. 6796 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6797 return Cost; 6798 6799 // Some targets support efficient element stores. 6800 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6801 return Cost; 6802 6803 // Collect operands to consider. 
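  // Descriptive note: for calls only the argument operands are considered
  // below; the callee operand is not a widened value here and therefore
  // cannot add extraction overhead.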
6804 CallInst *CI = dyn_cast<CallInst>(I); 6805 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6806 6807 // Skip operands that do not require extraction/scalarization and do not incur 6808 // any overhead. 6809 return Cost + TTI.getOperandsScalarizationOverhead( 6810 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 6811 } 6812 6813 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6814 if (VF.isScalar()) 6815 return; 6816 NumPredStores = 0; 6817 for (BasicBlock *BB : TheLoop->blocks()) { 6818 // For each instruction in the old loop. 6819 for (Instruction &I : *BB) { 6820 Value *Ptr = getLoadStorePointerOperand(&I); 6821 if (!Ptr) 6822 continue; 6823 6824 // TODO: We should generate better code and update the cost model for 6825 // predicated uniform stores. Today they are treated as any other 6826 // predicated store (see added test cases in 6827 // invariant-store-vectorization.ll). 6828 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6829 NumPredStores++; 6830 6831 if (Legal->isUniformMemOp(I)) { 6832 // TODO: Avoid replicating loads and stores instead of 6833 // relying on instcombine to remove them. 6834 // Load: Scalar load + broadcast 6835 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6836 unsigned Cost = getUniformMemOpCost(&I, VF); 6837 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6838 continue; 6839 } 6840 6841 // We assume that widening is the best solution when possible. 6842 if (memoryInstructionCanBeWidened(&I, VF)) { 6843 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6844 int ConsecutiveStride = 6845 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6846 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6847 "Expected consecutive stride."); 6848 InstWidening Decision = 6849 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6850 setWideningDecision(&I, VF, Decision, Cost); 6851 continue; 6852 } 6853 6854 // Choose between Interleaving, Gather/Scatter or Scalarization. 6855 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6856 unsigned NumAccesses = 1; 6857 if (isAccessInterleaved(&I)) { 6858 auto Group = getInterleavedAccessGroup(&I); 6859 assert(Group && "Fail to get an interleaved access group."); 6860 6861 // Make one decision for the whole group. 6862 if (getWideningDecision(&I, VF) != CM_Unknown) 6863 continue; 6864 6865 NumAccesses = Group->getNumMembers(); 6866 if (interleavedAccessCanBeWidened(&I, VF)) 6867 InterleaveCost = getInterleaveGroupCost(&I, VF); 6868 } 6869 6870 unsigned GatherScatterCost = 6871 isLegalGatherOrScatter(&I) 6872 ? getGatherScatterCost(&I, VF) * NumAccesses 6873 : std::numeric_limits<unsigned>::max(); 6874 6875 unsigned ScalarizationCost = 6876 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6877 6878 // Choose better solution for the current VF, 6879 // write down this decision and use it during vectorization. 6880 unsigned Cost; 6881 InstWidening Decision; 6882 if (InterleaveCost <= GatherScatterCost && 6883 InterleaveCost < ScalarizationCost) { 6884 Decision = CM_Interleave; 6885 Cost = InterleaveCost; 6886 } else if (GatherScatterCost < ScalarizationCost) { 6887 Decision = CM_GatherScatter; 6888 Cost = GatherScatterCost; 6889 } else { 6890 Decision = CM_Scalarize; 6891 Cost = ScalarizationCost; 6892 } 6893 // If the instructions belongs to an interleave group, the whole group 6894 // receives the same decision. 
The whole group receives the cost, but 6895 // the cost will actually be assigned to one instruction. 6896 if (auto Group = getInterleavedAccessGroup(&I)) 6897 setWideningDecision(Group, VF, Decision, Cost); 6898 else 6899 setWideningDecision(&I, VF, Decision, Cost); 6900 } 6901 } 6902 6903 // Make sure that any load of address and any other address computation 6904 // remains scalar unless there is gather/scatter support. This avoids 6905 // inevitable extracts into address registers, and also has the benefit of 6906 // activating LSR more, since that pass can't optimize vectorized 6907 // addresses. 6908 if (TTI.prefersVectorizedAddressing()) 6909 return; 6910 6911 // Start with all scalar pointer uses. 6912 SmallPtrSet<Instruction *, 8> AddrDefs; 6913 for (BasicBlock *BB : TheLoop->blocks()) 6914 for (Instruction &I : *BB) { 6915 Instruction *PtrDef = 6916 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6917 if (PtrDef && TheLoop->contains(PtrDef) && 6918 getWideningDecision(&I, VF) != CM_GatherScatter) 6919 AddrDefs.insert(PtrDef); 6920 } 6921 6922 // Add all instructions used to generate the addresses. 6923 SmallVector<Instruction *, 4> Worklist; 6924 for (auto *I : AddrDefs) 6925 Worklist.push_back(I); 6926 while (!Worklist.empty()) { 6927 Instruction *I = Worklist.pop_back_val(); 6928 for (auto &Op : I->operands()) 6929 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6930 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6931 AddrDefs.insert(InstOp).second) 6932 Worklist.push_back(InstOp); 6933 } 6934 6935 for (auto *I : AddrDefs) { 6936 if (isa<LoadInst>(I)) { 6937 // Setting the desired widening decision should ideally be handled in 6938 // by cost functions, but since this involves the task of finding out 6939 // if the loaded register is involved in an address computation, it is 6940 // instead changed here when we know this is the case. 6941 InstWidening Decision = getWideningDecision(I, VF); 6942 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6943 // Scalarize a widened load of address. 6944 setWideningDecision( 6945 I, VF, CM_Scalarize, 6946 (VF.getKnownMinValue() * 6947 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6948 else if (auto Group = getInterleavedAccessGroup(I)) { 6949 // Scalarize an interleave group of address loads. 6950 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6951 if (Instruction *Member = Group->getMember(I)) 6952 setWideningDecision( 6953 Member, VF, CM_Scalarize, 6954 (VF.getKnownMinValue() * 6955 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6956 } 6957 } 6958 } else 6959 // Make sure I gets scalarized and a cost estimate without 6960 // scalarization overhead. 6961 ForcedScalars[VF].insert(I); 6962 } 6963 } 6964 6965 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6966 ElementCount VF, 6967 Type *&VectorTy) { 6968 Type *RetTy = I->getType(); 6969 if (canTruncateToMinimalBitwidth(I, VF)) 6970 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6971 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6972 auto SE = PSE.getSE(); 6973 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6974 6975 // TODO: We need to estimate the cost of intrinsic calls. 6976 switch (I->getOpcode()) { 6977 case Instruction::GetElementPtr: 6978 // We mark this instruction as zero-cost because the cost of GEPs in 6979 // vectorized code depends on whether the corresponding memory instruction 6980 // is scalarized or not. 
Therefore, we handle GEPs with the memory 6981 // instruction cost. 6982 return 0; 6983 case Instruction::Br: { 6984 // In cases of scalarized and predicated instructions, there will be VF 6985 // predicated blocks in the vectorized loop. Each branch around these 6986 // blocks requires also an extract of its vector compare i1 element. 6987 bool ScalarPredicatedBB = false; 6988 BranchInst *BI = cast<BranchInst>(I); 6989 if (VF.isVector() && BI->isConditional() && 6990 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6991 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6992 ScalarPredicatedBB = true; 6993 6994 if (ScalarPredicatedBB) { 6995 // Return cost for branches around scalarized and predicated blocks. 6996 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6997 auto *Vec_i1Ty = 6998 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6999 return (TTI.getScalarizationOverhead( 7000 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7001 false, true) + 7002 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7003 VF.getKnownMinValue())); 7004 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7005 // The back-edge branch will remain, as will all scalar branches. 7006 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7007 else 7008 // This branch will be eliminated by if-conversion. 7009 return 0; 7010 // Note: We currently assume zero cost for an unconditional branch inside 7011 // a predicated block since it will become a fall-through, although we 7012 // may decide in the future to call TTI for all branches. 7013 } 7014 case Instruction::PHI: { 7015 auto *Phi = cast<PHINode>(I); 7016 7017 // First-order recurrences are replaced by vector shuffles inside the loop. 7018 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7019 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7020 return TTI.getShuffleCost( 7021 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7022 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7023 7024 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7025 // converted into select instructions. We require N - 1 selects per phi 7026 // node, where N is the number of incoming values. 7027 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7028 return (Phi->getNumIncomingValues() - 1) * 7029 TTI.getCmpSelInstrCost( 7030 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7031 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7032 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7033 7034 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7035 } 7036 case Instruction::UDiv: 7037 case Instruction::SDiv: 7038 case Instruction::URem: 7039 case Instruction::SRem: 7040 // If we have a predicated instruction, it may not be executed for each 7041 // vector lane. Get the scalarization cost and scale this amount by the 7042 // probability of executing the predicated block. If the instruction is not 7043 // predicated, we fall through to the next case. 7044 if (VF.isVector() && isScalarWithPredication(I)) { 7045 unsigned Cost = 0; 7046 7047 // These instructions have a non-void type, so account for the phi nodes 7048 // that we will create. This cost is likely to be zero. The phi node 7049 // cost, if any, should be scaled by the block probability because it 7050 // models a copy at the end of each predicated block. 
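    // Illustrative sketch of the formula below (made-up costs): with VF = 4, a
    // reciprocal predicated-block probability of 2, PHI cost 0, SDiv cost 20
    // and a scalarization overhead of 8, the result is
    //   (4 * 0 + 4 * 20 + 8) / 2 = 44.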
7051 Cost += VF.getKnownMinValue() * 7052 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7053 7054 // The cost of the non-predicated instruction. 7055 Cost += VF.getKnownMinValue() * 7056 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7057 7058 // The cost of insertelement and extractelement instructions needed for 7059 // scalarization. 7060 Cost += getScalarizationOverhead(I, VF); 7061 7062 // Scale the cost by the probability of executing the predicated blocks. 7063 // This assumes the predicated block for each vector lane is equally 7064 // likely. 7065 return Cost / getReciprocalPredBlockProb(); 7066 } 7067 LLVM_FALLTHROUGH; 7068 case Instruction::Add: 7069 case Instruction::FAdd: 7070 case Instruction::Sub: 7071 case Instruction::FSub: 7072 case Instruction::Mul: 7073 case Instruction::FMul: 7074 case Instruction::FDiv: 7075 case Instruction::FRem: 7076 case Instruction::Shl: 7077 case Instruction::LShr: 7078 case Instruction::AShr: 7079 case Instruction::And: 7080 case Instruction::Or: 7081 case Instruction::Xor: { 7082 // Since we will replace the stride by 1 the multiplication should go away. 7083 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7084 return 0; 7085 // Certain instructions can be cheaper to vectorize if they have a constant 7086 // second vector operand. One example of this are shifts on x86. 7087 Value *Op2 = I->getOperand(1); 7088 TargetTransformInfo::OperandValueProperties Op2VP; 7089 TargetTransformInfo::OperandValueKind Op2VK = 7090 TTI.getOperandInfo(Op2, Op2VP); 7091 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7092 Op2VK = TargetTransformInfo::OK_UniformValue; 7093 7094 SmallVector<const Value *, 4> Operands(I->operand_values()); 7095 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7096 return N * TTI.getArithmeticInstrCost( 7097 I->getOpcode(), VectorTy, CostKind, 7098 TargetTransformInfo::OK_AnyValue, 7099 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7100 } 7101 case Instruction::FNeg: { 7102 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7103 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7104 return N * TTI.getArithmeticInstrCost( 7105 I->getOpcode(), VectorTy, CostKind, 7106 TargetTransformInfo::OK_AnyValue, 7107 TargetTransformInfo::OK_AnyValue, 7108 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7109 I->getOperand(0), I); 7110 } 7111 case Instruction::Select: { 7112 SelectInst *SI = cast<SelectInst>(I); 7113 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7114 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7115 Type *CondTy = SI->getCondition()->getType(); 7116 if (!ScalarCond) { 7117 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7118 CondTy = VectorType::get(CondTy, VF); 7119 } 7120 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7121 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7122 } 7123 case Instruction::ICmp: 7124 case Instruction::FCmp: { 7125 Type *ValTy = I->getOperand(0)->getType(); 7126 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7127 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7128 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7129 VectorTy = ToVectorTy(ValTy, VF); 7130 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7131 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7132 } 7133 case Instruction::Store: 7134 case Instruction::Load: { 7135 ElementCount Width = VF; 7136 if (Width.isVector()) { 7137 InstWidening Decision = getWideningDecision(I, Width); 7138 assert(Decision != CM_Unknown && 7139 "CM decision should be taken at this point"); 7140 if (Decision == CM_Scalarize) 7141 Width = ElementCount::getFixed(1); 7142 } 7143 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7144 return getMemoryInstructionCost(I, VF); 7145 } 7146 case Instruction::ZExt: 7147 case Instruction::SExt: 7148 case Instruction::FPToUI: 7149 case Instruction::FPToSI: 7150 case Instruction::FPExt: 7151 case Instruction::PtrToInt: 7152 case Instruction::IntToPtr: 7153 case Instruction::SIToFP: 7154 case Instruction::UIToFP: 7155 case Instruction::Trunc: 7156 case Instruction::FPTrunc: 7157 case Instruction::BitCast: { 7158 // Computes the CastContextHint from a Load/Store instruction. 7159 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7160 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7161 "Expected a load or a store!"); 7162 7163 if (VF.isScalar() || !TheLoop->contains(I)) 7164 return TTI::CastContextHint::Normal; 7165 7166 switch (getWideningDecision(I, VF)) { 7167 case LoopVectorizationCostModel::CM_GatherScatter: 7168 return TTI::CastContextHint::GatherScatter; 7169 case LoopVectorizationCostModel::CM_Interleave: 7170 return TTI::CastContextHint::Interleave; 7171 case LoopVectorizationCostModel::CM_Scalarize: 7172 case LoopVectorizationCostModel::CM_Widen: 7173 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7174 : TTI::CastContextHint::Normal; 7175 case LoopVectorizationCostModel::CM_Widen_Reverse: 7176 return TTI::CastContextHint::Reversed; 7177 case LoopVectorizationCostModel::CM_Unknown: 7178 llvm_unreachable("Instr did not go through cost modelling?"); 7179 } 7180 7181 llvm_unreachable("Unhandled case!"); 7182 }; 7183 7184 unsigned Opcode = I->getOpcode(); 7185 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7186 // For Trunc, the context is the only user, which must be a StoreInst. 
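    // For example (illustrative): for "%t = trunc i32 %x to i16" whose single
    // user is a store that the cost model decided to widen with a mask, the
    // lambda above returns TTI::CastContextHint::Masked.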
7187 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7188 if (I->hasOneUse()) 7189 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7190 CCH = ComputeCCH(Store); 7191 } 7192 // For Z/Sext, the context is the operand, which must be a LoadInst. 7193 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7194 Opcode == Instruction::FPExt) { 7195 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7196 CCH = ComputeCCH(Load); 7197 } 7198 7199 // We optimize the truncation of induction variables having constant 7200 // integer steps. The cost of these truncations is the same as the scalar 7201 // operation. 7202 if (isOptimizableIVTruncate(I, VF)) { 7203 auto *Trunc = cast<TruncInst>(I); 7204 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7205 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7206 } 7207 7208 Type *SrcScalarTy = I->getOperand(0)->getType(); 7209 Type *SrcVecTy = 7210 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7211 if (canTruncateToMinimalBitwidth(I, VF)) { 7212 // This cast is going to be shrunk. This may remove the cast or it might 7213 // turn it into slightly different cast. For example, if MinBW == 16, 7214 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7215 // 7216 // Calculate the modified src and dest types. 7217 Type *MinVecTy = VectorTy; 7218 if (Opcode == Instruction::Trunc) { 7219 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7220 VectorTy = 7221 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7222 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7223 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7224 VectorTy = 7225 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7226 } 7227 } 7228 7229 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7230 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7231 return N * 7232 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7233 } 7234 case Instruction::Call: { 7235 bool NeedToScalarize; 7236 CallInst *CI = cast<CallInst>(I); 7237 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7238 if (getVectorIntrinsicIDForCall(CI, TLI)) 7239 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 7240 return CallCost; 7241 } 7242 case Instruction::ExtractValue: { 7243 InstructionCost ExtractCost = 7244 TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7245 assert(ExtractCost.isValid() && "Invalid cost for ExtractValue"); 7246 return *(ExtractCost.getValue()); 7247 } 7248 default: 7249 // The cost of executing VF copies of the scalar instruction. This opcode 7250 // is unknown. Assume that it is the same as 'mul'. 7251 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7252 Instruction::Mul, VectorTy, CostKind) + 7253 getScalarizationOverhead(I, VF); 7254 } // end of switch. 
7255 } 7256 7257 char LoopVectorize::ID = 0; 7258 7259 static const char lv_name[] = "Loop Vectorization"; 7260 7261 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7262 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7263 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7264 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7265 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7266 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7267 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7268 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7269 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7270 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7271 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7272 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7273 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7274 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7275 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7276 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7277 7278 namespace llvm { 7279 7280 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7281 7282 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7283 bool VectorizeOnlyWhenForced) { 7284 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7285 } 7286 7287 } // end namespace llvm 7288 7289 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7290 // Check if the pointer operand of a load or store instruction is 7291 // consecutive. 7292 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7293 return Legal->isConsecutivePtr(Ptr); 7294 return false; 7295 } 7296 7297 void LoopVectorizationCostModel::collectValuesToIgnore() { 7298 // Ignore ephemeral values. 7299 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7300 7301 // Ignore type-promoting instructions we identified during reduction 7302 // detection. 7303 for (auto &Reduction : Legal->getReductionVars()) { 7304 RecurrenceDescriptor &RedDes = Reduction.second; 7305 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7306 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7307 } 7308 // Ignore type-casting instructions we identified during induction 7309 // detection. 7310 for (auto &Induction : Legal->getInductionVars()) { 7311 InductionDescriptor &IndDes = Induction.second; 7312 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7313 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7314 } 7315 } 7316 7317 void LoopVectorizationCostModel::collectInLoopReductions() { 7318 for (auto &Reduction : Legal->getReductionVars()) { 7319 PHINode *Phi = Reduction.first; 7320 RecurrenceDescriptor &RdxDesc = Reduction.second; 7321 7322 // We don't collect reductions that are type promoted (yet). 7323 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7324 continue; 7325 7326 // If the target would prefer this reduction to happen "in-loop", then we 7327 // want to record it as such. 7328 unsigned Opcode = RdxDesc.getRecurrenceBinOp(); 7329 if (!PreferInLoopReductions && 7330 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7331 TargetTransformInfo::ReductionFlags())) 7332 continue; 7333 7334 // Check that we can correctly put the reductions into the loop, by 7335 // finding the chain of operations that leads from the phi to the loop 7336 // exit value. 
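    // For example (illustrative IR), for a plain integer sum
    //   %red = phi i32 [ 0, %preheader ], [ %red.next, %loop ]
    //   %red.next = add i32 %red, %val
    // the chain found below would be just { %red.next }, so the reduction can
    // be kept in-loop.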
7337 SmallVector<Instruction *, 4> ReductionOperations = 7338 RdxDesc.getReductionOpChain(Phi, TheLoop); 7339 bool InLoop = !ReductionOperations.empty(); 7340 if (InLoop) 7341 InLoopReductionChains[Phi] = ReductionOperations; 7342 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7343 << " reduction for phi: " << *Phi << "\n"); 7344 } 7345 } 7346 7347 // TODO: we could return a pair of values that specify the max VF and 7348 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7349 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment 7350 // doesn't have a cost model that can choose which plan to execute if 7351 // more than one is generated. 7352 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7353 LoopVectorizationCostModel &CM) { 7354 unsigned WidestType; 7355 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7356 return WidestVectorRegBits / WidestType; 7357 } 7358 7359 VectorizationFactor 7360 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7361 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7362 ElementCount VF = UserVF; 7363 // Outer loop handling: outer loops may require CFG and instruction-level 7364 // transformations before even evaluating whether vectorization is profitable. 7365 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7366 // the vectorization pipeline. 7367 if (!OrigLoop->isInnermost()) { 7368 // If the user doesn't provide a vectorization factor, determine a 7369 // reasonable one. 7370 if (UserVF.isZero()) { 7371 VF = ElementCount::getFixed( 7372 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 7373 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7374 7375 // Make sure we have a VF > 1 for stress testing. 7376 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7377 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7378 << "overriding computed VF.\n"); 7379 VF = ElementCount::getFixed(4); 7380 } 7381 } 7382 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7383 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7384 "VF needs to be a power of two"); 7385 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7386 << "VF " << VF << " to build VPlans.\n"); 7387 buildVPlans(VF, VF); 7388 7389 // For VPlan build stress testing, we bail out after VPlan construction. 7390 if (VPlanBuildStressTest) 7391 return VectorizationFactor::Disabled(); 7392 7393 return {VF, 0 /*Cost*/}; 7394 } 7395 7396 LLVM_DEBUG( 7397 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7398 "VPlan-native path.\n"); 7399 return VectorizationFactor::Disabled(); 7400 } 7401 7402 Optional<VectorizationFactor> 7403 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7404 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7405 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7406 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved. 7407 return None; 7408 7409 // Invalidate interleave groups if all blocks of the loop will be predicated.
7410 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7411 !useMaskedInterleavedAccesses(*TTI)) { 7412 LLVM_DEBUG( 7413 dbgs() 7414 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7415 "which requires masked-interleaved support.\n"); 7416 if (CM.InterleaveInfo.invalidateGroups()) 7417 // Invalidating interleave groups also requires invalidating all decisions 7418 // based on them, which includes widening decisions and uniform and scalar 7419 // values. 7420 CM.invalidateCostModelingDecisions(); 7421 } 7422 7423 ElementCount MaxVF = MaybeMaxVF.getValue(); 7424 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7425 7426 if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) { 7427 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7428 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7429 "VF needs to be a power of two"); 7430 // Collect the instructions (and their associated costs) that will be more 7431 // profitable to scalarize. 7432 CM.selectUserVectorizationFactor(UserVF); 7433 CM.collectInLoopReductions(); 7434 buildVPlansWithVPRecipes(UserVF, UserVF); 7435 LLVM_DEBUG(printPlans(dbgs())); 7436 return {{UserVF, 0}}; 7437 } 7438 7439 assert(!MaxVF.isScalable() && 7440 "Scalable vectors not yet supported beyond this point"); 7441 7442 for (ElementCount VF = ElementCount::getFixed(1); 7443 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7444 // Collect Uniform and Scalar instructions after vectorization with VF. 7445 CM.collectUniformsAndScalars(VF); 7446 7447 // Collect the instructions (and their associated costs) that will be more 7448 // profitable to scalarize. 7449 if (VF.isVector()) 7450 CM.collectInstsToScalarize(VF); 7451 } 7452 7453 CM.collectInLoopReductions(); 7454 7455 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7456 LLVM_DEBUG(printPlans(dbgs())); 7457 if (MaxVF.isScalar()) 7458 return VectorizationFactor::Disabled(); 7459 7460 // Select the optimal vectorization factor. 7461 return CM.selectVectorizationFactor(MaxVF); 7462 } 7463 7464 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7465 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7466 << '\n'); 7467 BestVF = VF; 7468 BestUF = UF; 7469 7470 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7471 return !Plan->hasVF(VF); 7472 }); 7473 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7474 } 7475 7476 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7477 DominatorTree *DT) { 7478 // Perform the actual loop transformation. 7479 7480 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7481 VPCallbackILV CallbackILV(ILV); 7482 7483 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7484 7485 VPTransformState State{*BestVF, BestUF, LI, 7486 DT, ILV.Builder, ILV.VectorLoopValueMap, 7487 &ILV, CallbackILV}; 7488 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7489 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7490 State.CanonicalIV = ILV.Induction; 7491 7492 ILV.printDebugTracesAtStart(); 7493 7494 //===------------------------------------------------===// 7495 // 7496 // Notice: any optimization or new instruction that go 7497 // into the code below should also be implemented in 7498 // the cost-model. 7499 // 7500 //===------------------------------------------------===// 7501 7502 // 2. Copy and widen instructions from the old loop into the new loop. 
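  // Minimal sketch of the expected state here: setBestPlan() has already
  // erased every candidate plan except the one containing BestVF, so executing
  // the single remaining VPlan emits the widened instructions into the
  // skeleton created in step 1.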
7503 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7504 VPlans.front()->execute(&State); 7505 7506 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7507 // predication, updating analyses. 7508 ILV.fixVectorizedLoop(); 7509 7510 ILV.printDebugTracesAtEnd(); 7511 } 7512 7513 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7514 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7515 7516 // We create new control-flow for the vectorized loop, so the original exit 7517 // condition will be dead after vectorization if it is only used by the 7518 // terminator. 7519 SmallVector<BasicBlock*> ExitingBlocks; 7520 OrigLoop->getExitingBlocks(ExitingBlocks); 7521 for (auto *BB : ExitingBlocks) { 7522 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7523 if (!Cmp || !Cmp->hasOneUse()) 7524 continue; 7525 7526 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7527 if (!DeadInstructions.insert(Cmp).second) 7528 continue; 7529 7530 // The operands of the icmp are often dead truncs, used by IndUpdate. 7531 // TODO: can recurse through operands in general 7532 for (Value *Op : Cmp->operands()) { 7533 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7534 DeadInstructions.insert(cast<Instruction>(Op)); 7535 } 7536 } 7537 7538 // We create new "steps" for induction variable updates to which the original 7539 // induction variables map. An original update instruction will be dead if 7540 // all its users except the induction variable are dead. 7541 auto *Latch = OrigLoop->getLoopLatch(); 7542 for (auto &Induction : Legal->getInductionVars()) { 7543 PHINode *Ind = Induction.first; 7544 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7545 7546 // If the tail is to be folded by masking, the primary induction variable, 7547 // if it exists, isn't dead: it will be used for masking. Don't kill it. 7548 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7549 continue; 7550 7551 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7552 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7553 })) 7554 DeadInstructions.insert(IndUpdate); 7555 7556 // We also record as "Dead" the type-casting instructions we had identified 7557 // during induction analysis. We don't need any handling for them in the 7558 // vectorized loop because we have proven that, under a proper runtime 7559 // test guarding the vectorized loop, the value of the phi, and the casted 7560 // value of the phi, are the same. The last instruction in this casting chain 7561 // will get its scalar/vector/widened def from the scalar/vector/widened def 7562 // of the respective phi node. Any other casts in the induction def-use chain 7563 // have no other uses outside the phi update chain, and will be ignored. 7564 InductionDescriptor &IndDes = Induction.second; 7565 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7566 DeadInstructions.insert(Casts.begin(), Casts.end()); 7567 } 7568 } 7569 7570 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7571 7572 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7573 7574 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7575 Instruction::BinaryOps BinOp) { 7576 // When unrolling and the VF is 1, we only need to add a simple scalar.
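  // For example (illustrative): with StartIdx = 2 and an integer step %s, the
  // code below produces add(%val, mul(2, %s)), named "induction".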
7577 Type *Ty = Val->getType(); 7578 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7579 7580 if (Ty->isFloatingPointTy()) { 7581 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7582 7583 // Floating point operations had to be 'fast' to enable the unrolling. 7584 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7585 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7586 } 7587 Constant *C = ConstantInt::get(Ty, StartIdx); 7588 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7589 } 7590 7591 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7592 SmallVector<Metadata *, 4> MDs; 7593 // Reserve first location for self reference to the LoopID metadata node. 7594 MDs.push_back(nullptr); 7595 bool IsUnrollMetadata = false; 7596 MDNode *LoopID = L->getLoopID(); 7597 if (LoopID) { 7598 // First find existing loop unrolling disable metadata. 7599 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7600 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7601 if (MD) { 7602 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7603 IsUnrollMetadata = 7604 S && S->getString().startswith("llvm.loop.unroll.disable"); 7605 } 7606 MDs.push_back(LoopID->getOperand(i)); 7607 } 7608 } 7609 7610 if (!IsUnrollMetadata) { 7611 // Add runtime unroll disable metadata. 7612 LLVMContext &Context = L->getHeader()->getContext(); 7613 SmallVector<Metadata *, 1> DisableOperands; 7614 DisableOperands.push_back( 7615 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7616 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7617 MDs.push_back(DisableNode); 7618 MDNode *NewLoopID = MDNode::get(Context, MDs); 7619 // Set operand 0 to refer to the loop id itself. 7620 NewLoopID->replaceOperandWith(0, NewLoopID); 7621 L->setLoopID(NewLoopID); 7622 } 7623 } 7624 7625 //===--------------------------------------------------------------------===// 7626 // EpilogueVectorizerMainLoop 7627 //===--------------------------------------------------------------------===// 7628 7629 /// This function is partially responsible for generating the control flow 7630 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7631 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7632 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7633 Loop *Lp = createVectorLoopSkeleton(""); 7634 7635 // Generate the code to check the minimum iteration count of the vector 7636 // epilogue (see below). 7637 EPI.EpilogueIterationCountCheck = 7638 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7639 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7640 7641 // Generate the code to check any assumptions that we've made for SCEV 7642 // expressions. 7643 BasicBlock *SavedPreHeader = LoopVectorPreHeader; 7644 emitSCEVChecks(Lp, LoopScalarPreHeader); 7645 7646 // If a safety check was generated, save it. 7647 if (SavedPreHeader != LoopVectorPreHeader) 7648 EPI.SCEVSafetyCheck = SavedPreHeader; 7649 7650 // Generate the code that checks at runtime if arrays overlap. We put the 7651 // checks into a separate block to make the more common case of few elements 7652 // faster. 7653 SavedPreHeader = LoopVectorPreHeader; 7654 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 7655 7656 // If a safety check was generated, save/overwrite it.
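  // Descriptive note: emitMemRuntimeChecks() splits off a fresh
  // LoopVectorPreHeader only when it actually emitted checks, so comparing the
  // current preheader against the block saved above is how the presence of a
  // memcheck block is detected.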
7657 if (SavedPreHeader != LoopVectorPreHeader) 7658 EPI.MemSafetyCheck = SavedPreHeader; 7659 7660 // Generate the iteration count check for the main loop, *after* the check 7661 // for the epilogue loop, so that the path-length is shorter for the case 7662 // that goes directly through the vector epilogue. The longer-path length for 7663 // the main loop is compensated for, by the gain from vectorizing the larger 7664 // trip count. Note: the branch will get updated later on when we vectorize 7665 // the epilogue. 7666 EPI.MainLoopIterationCountCheck = 7667 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7668 7669 // Generate the induction variable. 7670 OldInduction = Legal->getPrimaryInduction(); 7671 Type *IdxTy = Legal->getWidestInductionType(); 7672 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7673 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7674 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7675 EPI.VectorTripCount = CountRoundDown; 7676 Induction = 7677 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7678 getDebugLocFromInstOrOperands(OldInduction)); 7679 7680 // Skip induction resume value creation here because they will be created in 7681 // the second pass. If we created them here, they wouldn't be used anyway, 7682 // because the vplan in the second pass still contains the inductions from the 7683 // original loop. 7684 7685 return completeLoopSkeleton(Lp, OrigLoopID); 7686 } 7687 7688 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7689 LLVM_DEBUG({ 7690 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7691 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7692 << ", Main Loop UF:" << EPI.MainLoopUF 7693 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7694 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7695 }); 7696 } 7697 7698 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7699 DEBUG_WITH_TYPE(VerboseDebug, { 7700 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 7701 }); 7702 } 7703 7704 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 7705 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 7706 assert(L && "Expected valid Loop."); 7707 assert(Bypass && "Expected valid bypass basic block."); 7708 unsigned VFactor = 7709 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 7710 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7711 Value *Count = getOrCreateTripCount(L); 7712 // Reuse existing vector loop preheader for TC checks. 7713 // Note that new preheader block is generated for vector loop. 7714 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7715 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7716 7717 // Generate code to check if the loop's trip count is less than VF * UF of the 7718 // main vector loop. 7719 auto P = 7720 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7721 7722 Value *CheckMinIters = Builder.CreateICmp( 7723 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 7724 "min.iters.check"); 7725 7726 if (!ForEpilogue) 7727 TCCheckBlock->setName("vector.main.loop.iter.check"); 7728 7729 // Create new preheader for vector loop. 
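  // Rough shape of the control flow set up here (illustrative; the conditional
  // branch is installed by the ReplaceInstWithInst() call at the end of this
  // function):
  //
  //   TCCheckBlock --(too few iterations)--> Bypass
  //        |
  //        v
  //   vector.ph (the new LoopVectorPreHeader)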
7730 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7731 DT, LI, nullptr, "vector.ph"); 7732 7733 if (ForEpilogue) { 7734 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7735 DT->getNode(Bypass)->getIDom()) && 7736 "TC check is expected to dominate Bypass"); 7737 7738 // Update dominator for Bypass & LoopExit. 7739 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7740 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7741 7742 LoopBypassBlocks.push_back(TCCheckBlock); 7743 7744 // Save the trip count so we don't have to regenerate it in the 7745 // vec.epilog.iter.check. This is safe to do because the trip count 7746 // generated here dominates the vector epilog iter check. 7747 EPI.TripCount = Count; 7748 } 7749 7750 ReplaceInstWithInst( 7751 TCCheckBlock->getTerminator(), 7752 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7753 7754 return TCCheckBlock; 7755 } 7756 7757 //===--------------------------------------------------------------------===// 7758 // EpilogueVectorizerEpilogueLoop 7759 //===--------------------------------------------------------------------===// 7760 7761 /// This function is partially responsible for generating the control flow 7762 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7763 BasicBlock * 7764 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7765 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7766 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 7767 7768 // Now, compare the remaining count and if there aren't enough iterations to 7769 // execute the vectorized epilogue skip to the scalar part. 7770 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7771 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7772 LoopVectorPreHeader = 7773 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7774 LI, nullptr, "vec.epilog.ph"); 7775 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 7776 VecEpilogueIterationCountCheck); 7777 7778 // Adjust the control flow taking the state info from the main loop 7779 // vectorization into account. 7780 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7781 "expected this to be saved from the previous pass."); 7782 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7783 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7784 7785 DT->changeImmediateDominator(LoopVectorPreHeader, 7786 EPI.MainLoopIterationCountCheck); 7787 7788 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7789 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7790 7791 if (EPI.SCEVSafetyCheck) 7792 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7793 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7794 if (EPI.MemSafetyCheck) 7795 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7796 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7797 7798 DT->changeImmediateDominator( 7799 VecEpilogueIterationCountCheck, 7800 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7801 7802 DT->changeImmediateDominator(LoopScalarPreHeader, 7803 EPI.EpilogueIterationCountCheck); 7804 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 7805 7806 // Keep track of bypass blocks, as they feed start values to the induction 7807 // phis in the scalar loop preheader. 
7808 if (EPI.SCEVSafetyCheck) 7809 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7810 if (EPI.MemSafetyCheck) 7811 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7812 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7813 7814 // Generate a resume induction for the vector epilogue and put it in the 7815 // vector epilogue preheader. 7816 Type *IdxTy = Legal->getWidestInductionType(); 7817 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 7818 LoopVectorPreHeader->getFirstNonPHI()); 7819 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 7820 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 7821 EPI.MainLoopIterationCountCheck); 7822 7823 // Generate the induction variable. 7824 OldInduction = Legal->getPrimaryInduction(); 7825 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7826 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7827 Value *StartIdx = EPResumeVal; 7828 Induction = 7829 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7830 getDebugLocFromInstOrOperands(OldInduction)); 7831 7832 // Generate induction resume values. These variables save the new starting 7833 // indexes for the scalar loop. They are used to test if there are any tail 7834 // iterations left once the vector loop has completed. 7835 // Note that when the vectorized epilogue is skipped due to the iteration 7836 // count check, the resume value for the induction variable comes from 7837 // the trip count of the main vector loop, hence passing the AdditionalBypass 7838 // argument. 7839 createInductionResumeValues(Lp, CountRoundDown, 7840 {VecEpilogueIterationCountCheck, 7841 EPI.VectorTripCount} /* AdditionalBypass */); 7842 7843 AddRuntimeUnrollDisableMetaData(Lp); 7844 return completeLoopSkeleton(Lp, OrigLoopID); 7845 } 7846 7847 BasicBlock * 7848 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 7849 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 7850 7851 assert(EPI.TripCount && 7852 "Expected trip count to have been saved in the first pass."); 7853 assert( 7854 (!isa<Instruction>(EPI.TripCount) || 7855 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 7856 "saved trip count does not dominate insertion point."); 7857 Value *TC = EPI.TripCount; 7858 IRBuilder<> Builder(Insert->getTerminator()); 7859 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 7860 7861 // Generate code to check if the loop's trip count is less than VF * UF of the 7862 // vector epilogue loop. 7863 auto P = 7864 Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7865 7866 Value *CheckMinIters = Builder.CreateICmp( 7867 P, Count, 7868 ConstantInt::get(Count->getType(), 7869 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 7870 "min.epilog.iters.check"); 7871 7872 ReplaceInstWithInst( 7873 Insert->getTerminator(), 7874 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7875 7876 LoopBypassBlocks.push_back(Insert); 7877 return Insert; 7878 } 7879 7880 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7881 LLVM_DEBUG({ 7882 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7883 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7884 << ", Main Loop UF:" << EPI.MainLoopUF 7885 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7886 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7887 }); 7888 } 7889 7890 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7891 DEBUG_WITH_TYPE(VerboseDebug, { 7892 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 7893 }); 7894 } 7895 7896 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7897 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7898 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7899 bool PredicateAtRangeStart = Predicate(Range.Start); 7900 7901 for (ElementCount TmpVF = Range.Start * 2; 7902 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 7903 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7904 Range.End = TmpVF; 7905 break; 7906 } 7907 7908 return PredicateAtRangeStart; 7909 } 7910 7911 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7912 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7913 /// of VF's starting at a given VF and extending it as much as possible. Each 7914 /// vectorization decision can potentially shorten this sub-range during 7915 /// buildVPlan(). 7916 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7917 ElementCount MaxVF) { 7918 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 7919 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 7920 VFRange SubRange = {VF, MaxVFPlusOne}; 7921 VPlans.push_back(buildVPlan(SubRange)); 7922 VF = SubRange.End; 7923 } 7924 } 7925 7926 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7927 VPlanPtr &Plan) { 7928 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7929 7930 // Look for cached value. 7931 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7932 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7933 if (ECEntryIt != EdgeMaskCache.end()) 7934 return ECEntryIt->second; 7935 7936 VPValue *SrcMask = createBlockInMask(Src, Plan); 7937 7938 // The terminator has to be a branch inst! 7939 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7940 assert(BI && "Unexpected terminator found"); 7941 7942 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7943 return EdgeMaskCache[Edge] = SrcMask; 7944 7945 // If source is an exiting block, we know the exit edge is dynamically dead 7946 // in the vector loop, and thus we don't need to restrict the mask. Avoid 7947 // adding uses of an otherwise potentially dead instruction. 
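  // Descriptive note: returning early here also means we never call
  // getOrAddVPValue() on BI->getCondition() below, which is exactly the
  // "otherwise potentially dead instruction" mentioned above.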
7948 if (OrigLoop->isLoopExiting(Src)) 7949 return EdgeMaskCache[Edge] = SrcMask; 7950 7951 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 7952 assert(EdgeMask && "No Edge Mask found for condition"); 7953 7954 if (BI->getSuccessor(0) != Dst) 7955 EdgeMask = Builder.createNot(EdgeMask); 7956 7957 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 7958 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 7959 7960 return EdgeMaskCache[Edge] = EdgeMask; 7961 } 7962 7963 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7964 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7965 7966 // Look for cached value. 7967 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7968 if (BCEntryIt != BlockMaskCache.end()) 7969 return BCEntryIt->second; 7970 7971 // All-one mask is modelled as no-mask following the convention for masked 7972 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7973 VPValue *BlockMask = nullptr; 7974 7975 if (OrigLoop->getHeader() == BB) { 7976 if (!CM.blockNeedsPredication(BB)) 7977 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 7978 7979 // Create the block in mask as the first non-phi instruction in the block. 7980 VPBuilder::InsertPointGuard Guard(Builder); 7981 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 7982 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 7983 7984 // Introduce the early-exit compare IV <= BTC to form header block mask. 7985 // This is used instead of IV < TC because TC may wrap, unlike BTC. 7986 // Start by constructing the desired canonical IV. 7987 VPValue *IV = nullptr; 7988 if (Legal->getPrimaryInduction()) 7989 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 7990 else { 7991 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 7992 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 7993 IV = IVRecipe->getVPValue(); 7994 } 7995 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 7996 bool TailFolded = !CM.isScalarEpilogueAllowed(); 7997 7998 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 7999 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8000 // as a second argument, we only pass the IV here and extract the 8001 // tripcount from the transform state where codegen of the VP instructions 8002 // happen. 8003 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8004 } else { 8005 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8006 } 8007 return BlockMaskCache[BB] = BlockMask; 8008 } 8009 8010 // This is the block mask. We OR all incoming edges. 8011 for (auto *Predecessor : predecessors(BB)) { 8012 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8013 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8014 return BlockMaskCache[BB] = EdgeMask; 8015 8016 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8017 BlockMask = EdgeMask; 8018 continue; 8019 } 8020 8021 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8022 } 8023 8024 return BlockMaskCache[BB] = BlockMask; 8025 } 8026 8027 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8028 VPlanPtr &Plan) { 8029 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8030 "Must be called with either a load or store"); 8031 8032 auto willWiden = [&](ElementCount VF) -> bool { 8033 if (VF.isScalar()) 8034 return false; 8035 LoopVectorizationCostModel::InstWidening Decision = 8036 CM.getWideningDecision(I, VF); 8037 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8038 "CM decision should be taken at this point."); 8039 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8040 return true; 8041 if (CM.isScalarAfterVectorization(I, VF) || 8042 CM.isProfitableToScalarize(I, VF)) 8043 return false; 8044 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8045 }; 8046 8047 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8048 return nullptr; 8049 8050 VPValue *Mask = nullptr; 8051 if (Legal->isMaskRequired(I)) 8052 Mask = createBlockInMask(I->getParent(), Plan); 8053 8054 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8055 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8056 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8057 8058 StoreInst *Store = cast<StoreInst>(I); 8059 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8060 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8061 } 8062 8063 VPWidenIntOrFpInductionRecipe * 8064 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 8065 // Check if this is an integer or fp induction. If so, build the recipe that 8066 // produces its scalar and vector values. 8067 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8068 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8069 II.getKind() == InductionDescriptor::IK_FpInduction) 8070 return new VPWidenIntOrFpInductionRecipe(Phi); 8071 8072 return nullptr; 8073 } 8074 8075 VPWidenIntOrFpInductionRecipe * 8076 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 8077 VFRange &Range) const { 8078 // Optimize the special case where the source is a constant integer 8079 // induction variable. Notice that we can only optimize the 'trunc' case 8080 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8081 // (c) other casts depend on pointer size. 8082 8083 // Determine whether \p K is a truncation based on an induction variable that 8084 // can be optimized. 8085 auto isOptimizableIVTruncate = 8086 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8087 return [=](ElementCount VF) -> bool { 8088 return CM.isOptimizableIVTruncate(K, VF); 8089 }; 8090 }; 8091 8092 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8093 isOptimizableIVTruncate(I), Range)) 8094 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8095 I); 8096 return nullptr; 8097 } 8098 8099 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8100 // We know that all PHIs in non-header blocks are converted into selects, so 8101 // we don't have to worry about the insertion order and we can just use the 8102 // builder. At this point we generate the predication tree. There may be 8103 // duplications since this is a simple recursive scan, but future 8104 // optimizations will clean it up. 
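  // For example (illustrative IR), a two-way blend for
  //   %p = phi i32 [ %a, %then ], [ %b, %else ]
  // gets the operand list { %a, edge-mask(then), %b, edge-mask(else) } below,
  // and is later lowered to selects on those edge masks.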
8105 8106 SmallVector<VPValue *, 2> Operands; 8107 unsigned NumIncoming = Phi->getNumIncomingValues(); 8108 for (unsigned In = 0; In < NumIncoming; In++) { 8109 VPValue *EdgeMask = 8110 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8111 assert((EdgeMask || NumIncoming == 1) && 8112 "Multiple predecessors with one having a full mask"); 8113 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8114 if (EdgeMask) 8115 Operands.push_back(EdgeMask); 8116 } 8117 return new VPBlendRecipe(Phi, Operands); 8118 } 8119 8120 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8121 VPlan &Plan) const { 8122 8123 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8124 [this, CI](ElementCount VF) { 8125 return CM.isScalarWithPredication(CI, VF); 8126 }, 8127 Range); 8128 8129 if (IsPredicated) 8130 return nullptr; 8131 8132 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8133 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8134 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8135 ID == Intrinsic::pseudoprobe)) 8136 return nullptr; 8137 8138 auto willWiden = [&](ElementCount VF) -> bool { 8139 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8140 // The following case may be scalarized depending on the VF. 8141 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8142 // version of the instruction. 8143 // Is it beneficial to perform intrinsic call compared to lib call? 8144 bool NeedToScalarize = false; 8145 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8146 bool UseVectorIntrinsic = 8147 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 8148 return UseVectorIntrinsic || !NeedToScalarize; 8149 }; 8150 8151 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8152 return nullptr; 8153 8154 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8155 } 8156 8157 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8158 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8159 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8160 // Instruction should be widened, unless it is scalar after vectorization, 8161 // scalarization is profitable or it is predicated. 
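  // For example (VFs chosen for illustration only): if WillScalarize below is
  // false for VF=2 and VF=4 but true for VF=8, getDecisionAndClampRange clamps
  // Range.End so that this VPlan only covers {2, 4}; the instruction is then
  // widened here, and a later VPlan built for the remaining sub-range
  // replicates it instead.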
8162 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8163 return CM.isScalarAfterVectorization(I, VF) || 8164 CM.isProfitableToScalarize(I, VF) || 8165 CM.isScalarWithPredication(I, VF); 8166 }; 8167 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8168 Range); 8169 } 8170 8171 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8172 auto IsVectorizableOpcode = [](unsigned Opcode) { 8173 switch (Opcode) { 8174 case Instruction::Add: 8175 case Instruction::And: 8176 case Instruction::AShr: 8177 case Instruction::BitCast: 8178 case Instruction::FAdd: 8179 case Instruction::FCmp: 8180 case Instruction::FDiv: 8181 case Instruction::FMul: 8182 case Instruction::FNeg: 8183 case Instruction::FPExt: 8184 case Instruction::FPToSI: 8185 case Instruction::FPToUI: 8186 case Instruction::FPTrunc: 8187 case Instruction::FRem: 8188 case Instruction::FSub: 8189 case Instruction::ICmp: 8190 case Instruction::IntToPtr: 8191 case Instruction::LShr: 8192 case Instruction::Mul: 8193 case Instruction::Or: 8194 case Instruction::PtrToInt: 8195 case Instruction::SDiv: 8196 case Instruction::Select: 8197 case Instruction::SExt: 8198 case Instruction::Shl: 8199 case Instruction::SIToFP: 8200 case Instruction::SRem: 8201 case Instruction::Sub: 8202 case Instruction::Trunc: 8203 case Instruction::UDiv: 8204 case Instruction::UIToFP: 8205 case Instruction::URem: 8206 case Instruction::Xor: 8207 case Instruction::ZExt: 8208 return true; 8209 } 8210 return false; 8211 }; 8212 8213 if (!IsVectorizableOpcode(I->getOpcode())) 8214 return nullptr; 8215 8216 // Success: widen this instruction. 8217 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8218 } 8219 8220 VPBasicBlock *VPRecipeBuilder::handleReplication( 8221 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8222 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 8223 VPlanPtr &Plan) { 8224 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8225 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8226 Range); 8227 8228 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8229 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8230 Range); 8231 8232 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8233 IsUniform, IsPredicated); 8234 setRecipe(I, Recipe); 8235 Plan->addVPValue(I, Recipe); 8236 8237 // Find if I uses a predicated instruction. If so, it will use its scalar 8238 // value. Avoid hoisting the insert-element which packs the scalar value into 8239 // a vector value, as that happens iff all users use the vector value. 8240 for (auto &Op : I->operands()) 8241 if (auto *PredInst = dyn_cast<Instruction>(Op)) 8242 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 8243 PredInst2Recipe[PredInst]->setAlsoPack(false); 8244 8245 // Finalize the recipe for Instr, first if it is not predicated. 8246 if (!IsPredicated) { 8247 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8248 VPBB->appendRecipe(Recipe); 8249 return VPBB; 8250 } 8251 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8252 assert(VPBB->getSuccessors().empty() && 8253 "VPBB has successors when handling predicated replication."); 8254 // Record predicated instructions for above packing optimizations. 
8255 PredInst2Recipe[I] = Recipe; 8256 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8257 VPBlockUtils::insertBlockAfter(Region, VPBB); 8258 auto *RegSucc = new VPBasicBlock(); 8259 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8260 return RegSucc; 8261 } 8262 8263 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8264 VPRecipeBase *PredRecipe, 8265 VPlanPtr &Plan) { 8266 // Instructions marked for predication are replicated and placed under an 8267 // if-then construct to prevent side-effects. 8268 8269 // Generate recipes to compute the block mask for this region. 8270 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8271 8272 // Build the triangular if-then region. 8273 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8274 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8275 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8276 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8277 auto *PHIRecipe = Instr->getType()->isVoidTy() 8278 ? nullptr 8279 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8280 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8281 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8282 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8283 8284 // Note: first set Entry as region entry and then connect successors starting 8285 // from it in order, to propagate the "parent" of each VPBasicBlock. 8286 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8287 VPBlockUtils::connectBlocks(Pred, Exit); 8288 8289 return Region; 8290 } 8291 8292 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8293 VFRange &Range, 8294 VPlanPtr &Plan) { 8295 // First, check for specific widening recipes that deal with calls, memory 8296 // operations, inductions and Phi nodes. 8297 if (auto *CI = dyn_cast<CallInst>(Instr)) 8298 return tryToWidenCall(CI, Range, *Plan); 8299 8300 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8301 return tryToWidenMemory(Instr, Range, Plan); 8302 8303 VPRecipeBase *Recipe; 8304 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8305 if (Phi->getParent() != OrigLoop->getHeader()) 8306 return tryToBlend(Phi, Plan); 8307 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 8308 return Recipe; 8309 return new VPWidenPHIRecipe(Phi); 8310 } 8311 8312 if (isa<TruncInst>(Instr) && 8313 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 8314 return Recipe; 8315 8316 if (!shouldWiden(Instr, Range)) 8317 return nullptr; 8318 8319 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8320 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 8321 OrigLoop); 8322 8323 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8324 bool InvariantCond = 8325 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8326 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 8327 InvariantCond); 8328 } 8329 8330 return tryToWiden(Instr, *Plan); 8331 } 8332 8333 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8334 ElementCount MaxVF) { 8335 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8336 8337 // Collect instructions from the original loop that will become trivially dead 8338 // in the vectorized loop. We don't need to vectorize these instructions. 
  // For example, original induction update instructions can become dead
  // because we separately emit induction "steps" when generating code for the
  // new loop. Similarly, we create a new latch condition when setting up the
  // structure of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
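  // A small illustrative example (array and factor are made up): the two loads
  //   ... = A[2 * i];     // member 0
  //   ... = A[2 * i + 1]; // member 1
  // form one interleave group with factor 2; their individual widened-load
  // recipes are later replaced by a single VPInterleaveRecipe that emits one
  // wide load per part plus shuffles to de-interleave the lanes.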
8410 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8411 auto applyIG = [IG, this](ElementCount VF) -> bool { 8412 return (VF.isVector() && // Query is illegal for VF == 1 8413 CM.getWideningDecision(IG->getInsertPos(), VF) == 8414 LoopVectorizationCostModel::CM_Interleave); 8415 }; 8416 if (!getDecisionAndClampRange(applyIG, Range)) 8417 continue; 8418 InterleaveGroups.insert(IG); 8419 for (unsigned i = 0; i < IG->getFactor(); i++) 8420 if (Instruction *Member = IG->getMember(i)) 8421 RecipeBuilder.recordRecipeOf(Member); 8422 }; 8423 8424 // --------------------------------------------------------------------------- 8425 // Build initial VPlan: Scan the body of the loop in a topological order to 8426 // visit each basic block after having visited its predecessor basic blocks. 8427 // --------------------------------------------------------------------------- 8428 8429 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 8430 auto Plan = std::make_unique<VPlan>(); 8431 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 8432 Plan->setEntry(VPBB); 8433 8434 // Scan the body of the loop in a topological order to visit each basic block 8435 // after having visited its predecessor basic blocks. 8436 LoopBlocksDFS DFS(OrigLoop); 8437 DFS.perform(LI); 8438 8439 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8440 // Relevant instructions from basic block BB will be grouped into VPRecipe 8441 // ingredients and fill a new VPBasicBlock. 8442 unsigned VPBBsForBB = 0; 8443 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 8444 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 8445 VPBB = FirstVPBBForBB; 8446 Builder.setInsertPoint(VPBB); 8447 8448 // Introduce each ingredient into VPlan. 8449 // TODO: Model and preserve debug instrinsics in VPlan. 8450 for (Instruction &I : BB->instructionsWithoutDebug()) { 8451 Instruction *Instr = &I; 8452 8453 // First filter out irrelevant instructions, to ensure no recipes are 8454 // built for them. 8455 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8456 continue; 8457 8458 if (auto Recipe = 8459 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 8460 for (auto *Def : Recipe->definedValues()) { 8461 auto *UV = Def->getUnderlyingValue(); 8462 Plan->addVPValue(UV, Def); 8463 } 8464 8465 RecipeBuilder.setRecipe(Instr, Recipe); 8466 VPBB->appendRecipe(Recipe); 8467 continue; 8468 } 8469 8470 // Otherwise, if all widening options failed, Instruction is to be 8471 // replicated. This may create a successor for VPBB. 8472 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 8473 Instr, Range, VPBB, PredInst2Recipe, Plan); 8474 if (NextVPBB != VPBB) { 8475 VPBB = NextVPBB; 8476 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8477 : ""); 8478 } 8479 } 8480 } 8481 8482 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 8483 // may also be empty, such as the last one VPBB, reflecting original 8484 // basic-blocks with no recipes. 8485 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 8486 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 8487 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 8488 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 8489 delete PreEntry; 8490 8491 // --------------------------------------------------------------------------- 8492 // Transform initial VPlan: Apply previously taken decisions, in order, to 8493 // bring the VPlan to its final state. 
8494 // --------------------------------------------------------------------------- 8495 8496 // Apply Sink-After legal constraints. 8497 for (auto &Entry : SinkAfter) { 8498 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8499 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8500 Sink->moveAfter(Target); 8501 } 8502 8503 // Interleave memory: for each Interleave Group we marked earlier as relevant 8504 // for this VPlan, replace the Recipes widening its memory instructions with a 8505 // single VPInterleaveRecipe at its insertion point. 8506 for (auto IG : InterleaveGroups) { 8507 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8508 RecipeBuilder.getRecipe(IG->getInsertPos())); 8509 SmallVector<VPValue *, 4> StoredValues; 8510 for (unsigned i = 0; i < IG->getFactor(); ++i) 8511 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8512 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8513 8514 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8515 Recipe->getMask()); 8516 VPIG->insertBefore(Recipe); 8517 unsigned J = 0; 8518 for (unsigned i = 0; i < IG->getFactor(); ++i) 8519 if (Instruction *Member = IG->getMember(i)) { 8520 if (!Member->getType()->isVoidTy()) { 8521 VPValue *OriginalV = Plan->getVPValue(Member); 8522 Plan->removeVPValueFor(Member); 8523 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8524 J++; 8525 } 8526 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8527 } 8528 } 8529 8530 // Adjust the recipes for any inloop reductions. 8531 if (Range.Start.isVector()) 8532 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8533 8534 // Finally, if tail is folded by masking, introduce selects between the phi 8535 // and the live-out instruction of each reduction, at the end of the latch. 8536 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8537 Builder.setInsertPoint(VPBB); 8538 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8539 for (auto &Reduction : Legal->getReductionVars()) { 8540 if (CM.isInLoopReduction(Reduction.first)) 8541 continue; 8542 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8543 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8544 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8545 } 8546 } 8547 8548 std::string PlanName; 8549 raw_string_ostream RSO(PlanName); 8550 ElementCount VF = Range.Start; 8551 Plan->addVF(VF); 8552 RSO << "Initial VPlan for VF={" << VF; 8553 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8554 Plan->addVF(VF); 8555 RSO << "," << VF; 8556 } 8557 RSO << "},UF>=1"; 8558 RSO.flush(); 8559 Plan->setName(PlanName); 8560 8561 return Plan; 8562 } 8563 8564 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8565 // Outer loop handling: They may require CFG and instruction level 8566 // transformations before even evaluating whether vectorization is profitable. 8567 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8568 // the vectorization pipeline. 
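  // Rough sketch of when this is reached (not an exhaustive statement of the
  // conditions): only on the VPlan-native path, i.e. when EnableVPlanNativePath
  // is set and legality accepted an outer loop, typically one carrying explicit
  // vectorization hints, such as
  //   for (i) { for (j) { A[i][j] += B[i][j]; } }
  // for outer-loop vectorization.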
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
       VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(
      OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
  return Plan;
}

// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert(isa<VPWidenRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      auto *CondOp = CM.foldTailByMasking()
                         ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 8635 : nullptr; 8636 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 8637 &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); 8638 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8639 Plan->removeVPValueFor(R); 8640 Plan->addVPValue(R, RedRecipe); 8641 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 8642 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8643 WidenRecipe->eraseFromParent(); 8644 8645 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8646 VPRecipeBase *CompareRecipe = 8647 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 8648 assert(isa<VPWidenRecipe>(CompareRecipe) && 8649 "Expected to replace a VPWidenSC"); 8650 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 8651 "Expected no remaining users"); 8652 CompareRecipe->eraseFromParent(); 8653 } 8654 Chain = R; 8655 } 8656 } 8657 } 8658 8659 Value* LoopVectorizationPlanner::VPCallbackILV:: 8660 getOrCreateVectorValues(Value *V, unsigned Part) { 8661 return ILV.getOrCreateVectorValue(V, Part); 8662 } 8663 8664 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 8665 Value *V, const VPIteration &Instance) { 8666 return ILV.getOrCreateScalarValue(V, Instance); 8667 } 8668 8669 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 8670 VPSlotTracker &SlotTracker) const { 8671 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 8672 IG->getInsertPos()->printAsOperand(O, false); 8673 O << ", "; 8674 getAddr()->printAsOperand(O, SlotTracker); 8675 VPValue *Mask = getMask(); 8676 if (Mask) { 8677 O << ", "; 8678 Mask->printAsOperand(O, SlotTracker); 8679 } 8680 for (unsigned i = 0; i < IG->getFactor(); ++i) 8681 if (Instruction *I = IG->getMember(i)) 8682 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 8683 } 8684 8685 void VPWidenCallRecipe::execute(VPTransformState &State) { 8686 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 8687 *this, State); 8688 } 8689 8690 void VPWidenSelectRecipe::execute(VPTransformState &State) { 8691 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 8692 this, *this, InvariantCond, State); 8693 } 8694 8695 void VPWidenRecipe::execute(VPTransformState &State) { 8696 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 8697 } 8698 8699 void VPWidenGEPRecipe::execute(VPTransformState &State) { 8700 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 8701 *this, State.UF, State.VF, IsPtrLoopInvariant, 8702 IsIndexLoopInvariant, State); 8703 } 8704 8705 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 8706 assert(!State.Instance && "Int or FP induction being replicated."); 8707 State.ILV->widenIntOrFpInduction(IV, Trunc); 8708 } 8709 8710 void VPWidenPHIRecipe::execute(VPTransformState &State) { 8711 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 8712 } 8713 8714 void VPBlendRecipe::execute(VPTransformState &State) { 8715 State.ILV->setDebugLocFromInst(State.Builder, Phi); 8716 // We know that all PHIs in non-header blocks are converted into 8717 // selects, so we don't have to worry about the insertion order and we 8718 // can just use the builder. 8719 // At this point we generate the predication tree. There may be 8720 // duplications since this is a simple recursive scan, but future 8721 // optimizations will clean it up. 
8722 8723 unsigned NumIncoming = getNumIncomingValues(); 8724 8725 // Generate a sequence of selects of the form: 8726 // SELECT(Mask3, In3, 8727 // SELECT(Mask2, In2, 8728 // SELECT(Mask1, In1, 8729 // In0))) 8730 // Note that Mask0 is never used: lanes for which no path reaches this phi and 8731 // are essentially undef are taken from In0. 8732 InnerLoopVectorizer::VectorParts Entry(State.UF); 8733 for (unsigned In = 0; In < NumIncoming; ++In) { 8734 for (unsigned Part = 0; Part < State.UF; ++Part) { 8735 // We might have single edge PHIs (blocks) - use an identity 8736 // 'select' for the first PHI operand. 8737 Value *In0 = State.get(getIncomingValue(In), Part); 8738 if (In == 0) 8739 Entry[Part] = In0; // Initialize with the first incoming value. 8740 else { 8741 // Select between the current value and the previous incoming edge 8742 // based on the incoming mask. 8743 Value *Cond = State.get(getMask(In), Part); 8744 Entry[Part] = 8745 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 8746 } 8747 } 8748 } 8749 for (unsigned Part = 0; Part < State.UF; ++Part) 8750 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 8751 } 8752 8753 void VPInterleaveRecipe::execute(VPTransformState &State) { 8754 assert(!State.Instance && "Interleave group being replicated."); 8755 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 8756 getStoredValues(), getMask()); 8757 } 8758 8759 void VPReductionRecipe::execute(VPTransformState &State) { 8760 assert(!State.Instance && "Reduction being replicated."); 8761 for (unsigned Part = 0; Part < State.UF; ++Part) { 8762 RecurKind Kind = RdxDesc->getRecurrenceKind(); 8763 Value *NewVecOp = State.get(getVecOp(), Part); 8764 if (VPValue *Cond = getCondOp()) { 8765 Value *NewCond = State.get(Cond, Part); 8766 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 8767 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 8768 Kind, VecTy->getElementType()); 8769 Constant *IdenVec = 8770 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 8771 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 8772 NewVecOp = Select; 8773 } 8774 Value *NewRed = 8775 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 8776 Value *PrevInChain = State.get(getChainOp(), Part); 8777 Value *NextInChain; 8778 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8779 NextInChain = 8780 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 8781 NewRed, PrevInChain); 8782 } else { 8783 NextInChain = State.Builder.CreateBinOp( 8784 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 8785 PrevInChain); 8786 } 8787 State.set(this, getUnderlyingInstr(), NextInChain, Part); 8788 } 8789 } 8790 8791 void VPReplicateRecipe::execute(VPTransformState &State) { 8792 if (State.Instance) { // Generate a single instance. 8793 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 8794 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, 8795 *State.Instance, IsPredicated, State); 8796 // Insert scalar instance packing it into a vector. 8797 if (AlsoPack && State.VF.isVector()) { 8798 // If we're constructing lane 0, initialize to start from undef. 
      if (State.Instance->Lane == 0) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Undef = UndefValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.ValueMap.setVectorValue(getUnderlyingInstr(),
                                      State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
                                           *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
                                      IsPredicated, State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
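  // A minimal sketch of the vector case (block and value names are purely
  // illustrative):
  //   pred.load.continue:
  //     %vec.phi = phi <4 x i32> [ %vec.before, %pred.load.entry ],
  //                              [ %vec.with.elt, %pred.load.if ]
  // where %vec.with.elt is the insertelement emitted in the predicated block
  // and %vec.before is its first operand, i.e. the vector before the insert.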
8864 unsigned Part = State.Instance->Part; 8865 Instruction *PredInst = 8866 cast<Instruction>(getOperand(0)->getUnderlyingValue()); 8867 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 8868 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 8869 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 8870 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 8871 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 8872 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 8873 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 8874 } else { 8875 Type *PredInstType = PredInst->getType(); 8876 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 8877 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 8878 Phi->addIncoming(ScalarPredInst, PredicatedBB); 8879 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 8880 } 8881 } 8882 8883 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 8884 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 8885 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 8886 StoredValue ? nullptr : getVPValue(), 8887 getAddr(), StoredValue, getMask()); 8888 } 8889 8890 // Determine how to lower the scalar epilogue, which depends on 1) optimising 8891 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 8892 // predication, and 4) a TTI hook that analyses whether the loop is suitable 8893 // for predication. 8894 static ScalarEpilogueLowering getScalarEpilogueLowering( 8895 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 8896 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 8897 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 8898 LoopVectorizationLegality &LVL) { 8899 // 1) OptSize takes precedence over all other options, i.e. if this is set, 8900 // don't look at hints or options, and don't request a scalar epilogue. 8901 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 8902 // LoopAccessInfo (due to code dependency and not being able to reliably get 8903 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 8904 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 8905 // versioning when the vectorization is forced, unlike hasOptSize. So revert 8906 // back to the old way and vectorize with versioning when forced. See D81345.) 
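  // For example (an illustrative summary of the precedence implemented below):
  // a function built with -Os and without a forced vectorization hint returns
  // CM_ScalarEpilogueNotAllowedOptSize regardless of any predication settings;
  // otherwise an explicit PreferPredicateOverEpilogue option takes precedence
  // over the loop hints, which in turn take precedence over the TTI hook.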
8907 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 8908 PGSOQueryType::IRPass) && 8909 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 8910 return CM_ScalarEpilogueNotAllowedOptSize; 8911 8912 // 2) If set, obey the directives 8913 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 8914 switch (PreferPredicateOverEpilogue) { 8915 case PreferPredicateTy::ScalarEpilogue: 8916 return CM_ScalarEpilogueAllowed; 8917 case PreferPredicateTy::PredicateElseScalarEpilogue: 8918 return CM_ScalarEpilogueNotNeededUsePredicate; 8919 case PreferPredicateTy::PredicateOrDontVectorize: 8920 return CM_ScalarEpilogueNotAllowedUsePredicate; 8921 }; 8922 } 8923 8924 // 3) If set, obey the hints 8925 switch (Hints.getPredicate()) { 8926 case LoopVectorizeHints::FK_Enabled: 8927 return CM_ScalarEpilogueNotNeededUsePredicate; 8928 case LoopVectorizeHints::FK_Disabled: 8929 return CM_ScalarEpilogueAllowed; 8930 }; 8931 8932 // 4) if the TTI hook indicates this is profitable, request predication. 8933 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 8934 LVL.getLAI())) 8935 return CM_ScalarEpilogueNotNeededUsePredicate; 8936 8937 return CM_ScalarEpilogueAllowed; 8938 } 8939 8940 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, 8941 unsigned Part) { 8942 set(Def, V, Part); 8943 ILV->setVectorValue(IRDef, Part, V); 8944 } 8945 8946 // Process the loop in the VPlan-native vectorization path. This path builds 8947 // VPlan upfront in the vectorization pipeline, which allows to apply 8948 // VPlan-to-VPlan transformations from the very beginning without modifying the 8949 // input LLVM IR. 8950 static bool processLoopInVPlanNativePath( 8951 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 8952 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 8953 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 8954 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 8955 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 8956 8957 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 8958 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 8959 return false; 8960 } 8961 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 8962 Function *F = L->getHeader()->getParent(); 8963 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 8964 8965 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8966 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 8967 8968 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 8969 &Hints, IAI); 8970 // Use the planner for outer loop vectorization. 8971 // TODO: CM is not used at this point inside the planner. Turn CM into an 8972 // optional argument if we don't need it in the future. 8973 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 8974 8975 // Get user vectorization factor. 8976 ElementCount UserVF = Hints.getWidth(); 8977 8978 // Plan how to best vectorize, return the best VF and its cost. 8979 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 8980 8981 // If we are stress testing VPlan builds, do not attempt to generate vector 8982 // code. Masked vector code generation support will follow soon. 8983 // Also, do not attempt to vectorize if no vector code will be produced. 
8984 if (VPlanBuildStressTest || EnableVPlanPredication || 8985 VectorizationFactor::Disabled() == VF) 8986 return false; 8987 8988 LVP.setBestPlan(VF.Width, 1); 8989 8990 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 8991 &CM, BFI, PSI); 8992 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 8993 << L->getHeader()->getParent()->getName() << "\"\n"); 8994 LVP.executePlan(LB, DT); 8995 8996 // Mark the loop as already vectorized to avoid vectorizing again. 8997 Hints.setAlreadyVectorized(); 8998 8999 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9000 return true; 9001 } 9002 9003 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9004 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9005 !EnableLoopInterleaving), 9006 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9007 !EnableLoopVectorization) {} 9008 9009 bool LoopVectorizePass::processLoop(Loop *L) { 9010 assert((EnableVPlanNativePath || L->isInnermost()) && 9011 "VPlan-native path is not enabled. Only process inner loops."); 9012 9013 #ifndef NDEBUG 9014 const std::string DebugLocStr = getDebugLocString(L); 9015 #endif /* NDEBUG */ 9016 9017 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9018 << L->getHeader()->getParent()->getName() << "\" from " 9019 << DebugLocStr << "\n"); 9020 9021 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9022 9023 LLVM_DEBUG( 9024 dbgs() << "LV: Loop hints:" 9025 << " force=" 9026 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9027 ? "disabled" 9028 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9029 ? "enabled" 9030 : "?")) 9031 << " width=" << Hints.getWidth() 9032 << " unroll=" << Hints.getInterleave() << "\n"); 9033 9034 // Function containing loop 9035 Function *F = L->getHeader()->getParent(); 9036 9037 // Looking at the diagnostic output is the only way to determine if a loop 9038 // was vectorized (other than looking at the IR or machine code), so it 9039 // is important to generate an optimization remark for each loop. Most of 9040 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9041 // generated as OptimizationRemark and OptimizationRemarkMissed are 9042 // less verbose reporting vectorized loops and unvectorized loops that may 9043 // benefit from vectorization, respectively. 9044 9045 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9046 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9047 return false; 9048 } 9049 9050 PredicatedScalarEvolution PSE(*SE, *L); 9051 9052 // Check if it is legal to vectorize the loop. 9053 LoopVectorizationRequirements Requirements(*ORE); 9054 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9055 &Requirements, &Hints, DB, AC, BFI, PSI); 9056 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9057 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9058 Hints.emitRemarkWithHints(); 9059 return false; 9060 } 9061 9062 // Check the function attributes and profiles to find out if this function 9063 // should be optimized for size. 9064 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9065 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9066 9067 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9068 // here. They may require CFG and instruction level transformations before 9069 // even evaluating whether vectorization is profitable. 
Since we cannot modify 9070 // the incoming IR, we need to build VPlan upfront in the vectorization 9071 // pipeline. 9072 if (!L->isInnermost()) 9073 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9074 ORE, BFI, PSI, Hints); 9075 9076 assert(L->isInnermost() && "Inner loop expected."); 9077 9078 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9079 // count by optimizing for size, to minimize overheads. 9080 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9081 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9082 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9083 << "This loop is worth vectorizing only if no scalar " 9084 << "iteration overheads are incurred."); 9085 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9086 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9087 else { 9088 LLVM_DEBUG(dbgs() << "\n"); 9089 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9090 } 9091 } 9092 9093 // Check the function attributes to see if implicit floats are allowed. 9094 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9095 // an integer loop and the vector instructions selected are purely integer 9096 // vector instructions? 9097 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9098 reportVectorizationFailure( 9099 "Can't vectorize when the NoImplicitFloat attribute is used", 9100 "loop not vectorized due to NoImplicitFloat attribute", 9101 "NoImplicitFloat", ORE, L); 9102 Hints.emitRemarkWithHints(); 9103 return false; 9104 } 9105 9106 // Check if the target supports potentially unsafe FP vectorization. 9107 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9108 // for the target we're vectorizing for, to make sure none of the 9109 // additional fp-math flags can help. 9110 if (Hints.isPotentiallyUnsafe() && 9111 TTI->isFPVectorizationPotentiallyUnsafe()) { 9112 reportVectorizationFailure( 9113 "Potentially unsafe FP op prevents vectorization", 9114 "loop not vectorized due to unsafe FP support.", 9115 "UnsafeFP", ORE, L); 9116 Hints.emitRemarkWithHints(); 9117 return false; 9118 } 9119 9120 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9121 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9122 9123 // If an override option has been passed in for interleaved accesses, use it. 9124 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9125 UseInterleaved = EnableInterleavedMemAccesses; 9126 9127 // Analyze interleaved memory accesses. 9128 if (UseInterleaved) { 9129 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 9130 } 9131 9132 // Use the cost model. 9133 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 9134 F, &Hints, IAI); 9135 CM.collectValuesToIgnore(); 9136 9137 // Use the planner for vectorization. 9138 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 9139 9140 // Get user vectorization factor and interleave count. 9141 ElementCount UserVF = Hints.getWidth(); 9142 unsigned UserIC = Hints.getInterleave(); 9143 9144 // Plan how to best vectorize, return the best VF and its cost. 9145 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 9146 9147 VectorizationFactor VF = VectorizationFactor::Disabled(); 9148 unsigned IC = 1; 9149 9150 if (MaybeVF) { 9151 VF = *MaybeVF; 9152 // Select the interleave count. 
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9213 ORE->emit([&]() { 9214 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 9215 L->getStartLoc(), L->getHeader()) 9216 << VecDiagMsg.second; 9217 }); 9218 ORE->emit([&]() { 9219 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 9220 L->getStartLoc(), L->getHeader()) 9221 << IntDiagMsg.second; 9222 }); 9223 return false; 9224 } else if (!VectorizeLoop && InterleaveLoop) { 9225 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9226 ORE->emit([&]() { 9227 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 9228 L->getStartLoc(), L->getHeader()) 9229 << VecDiagMsg.second; 9230 }); 9231 } else if (VectorizeLoop && !InterleaveLoop) { 9232 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9233 << ") in " << DebugLocStr << '\n'); 9234 ORE->emit([&]() { 9235 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 9236 L->getStartLoc(), L->getHeader()) 9237 << IntDiagMsg.second; 9238 }); 9239 } else if (VectorizeLoop && InterleaveLoop) { 9240 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9241 << ") in " << DebugLocStr << '\n'); 9242 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9243 } 9244 9245 LVP.setBestPlan(VF.Width, IC); 9246 9247 using namespace ore; 9248 bool DisableRuntimeUnroll = false; 9249 MDNode *OrigLoopID = L->getLoopID(); 9250 9251 if (!VectorizeLoop) { 9252 assert(IC > 1 && "interleave count should not be 1 or 0"); 9253 // If we decided that it is not legal to vectorize the loop, then 9254 // interleave it. 9255 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, 9256 BFI, PSI); 9257 LVP.executePlan(Unroller, DT); 9258 9259 ORE->emit([&]() { 9260 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 9261 L->getHeader()) 9262 << "interleaved loop (interleaved count: " 9263 << NV("InterleaveCount", IC) << ")"; 9264 }); 9265 } else { 9266 // If we decided that it is *legal* to vectorize the loop, then do it. 9267 9268 // Consider vectorizing the epilogue too if it's profitable. 9269 VectorizationFactor EpilogueVF = 9270 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 9271 if (EpilogueVF.Width.isVector()) { 9272 9273 // The first pass vectorizes the main loop and creates a scalar epilogue 9274 // to be vectorized by executing the plan (potentially with a different 9275 // factor) again shortly afterwards. 9276 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 9277 EpilogueVF.Width.getKnownMinValue(), 1); 9278 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, 9279 &LVL, &CM, BFI, PSI); 9280 9281 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 9282 LVP.executePlan(MainILV, DT); 9283 ++LoopsVectorized; 9284 9285 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9286 formLCSSARecursively(*L, *DT, LI, SE); 9287 9288 // Second pass vectorizes the epilogue and adjusts the control flow 9289 // edges from the first pass. 
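      // A rough sketch of the resulting structure (VFs are illustrative): with
      // a main VF of 16, interleave count IC and an epilogue VF of 8, the
      // original loop is emitted as
      //   main vector loop (VF=16, UF=IC)
      //     -> epilogue vector loop (VF=8, UF=1)
      //       -> scalar remainder loop
      // with minimum-iteration checks steering how many iterations reach each
      // part.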
9290 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); 9291 EPI.MainLoopVF = EPI.EpilogueVF; 9292 EPI.MainLoopUF = EPI.EpilogueUF; 9293 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 9294 ORE, EPI, &LVL, &CM, BFI, PSI); 9295 LVP.executePlan(EpilogILV, DT); 9296 ++LoopsEpilogueVectorized; 9297 9298 if (!MainILV.areSafetyChecksAdded()) 9299 DisableRuntimeUnroll = true; 9300 } else { 9301 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 9302 &LVL, &CM, BFI, PSI); 9303 LVP.executePlan(LB, DT); 9304 ++LoopsVectorized; 9305 9306 // Add metadata to disable runtime unrolling a scalar loop when there are 9307 // no runtime checks about strides and memory. A scalar loop that is 9308 // rarely used is not worth unrolling. 9309 if (!LB.areSafetyChecksAdded()) 9310 DisableRuntimeUnroll = true; 9311 } 9312 9313 // Report the vectorization decision. 9314 ORE->emit([&]() { 9315 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 9316 L->getHeader()) 9317 << "vectorized loop (vectorization width: " 9318 << NV("VectorizationFactor", VF.Width) 9319 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 9320 }); 9321 } 9322 9323 Optional<MDNode *> RemainderLoopID = 9324 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 9325 LLVMLoopVectorizeFollowupEpilogue}); 9326 if (RemainderLoopID.hasValue()) { 9327 L->setLoopID(RemainderLoopID.getValue()); 9328 } else { 9329 if (DisableRuntimeUnroll) 9330 AddRuntimeUnrollDisableMetaData(L); 9331 9332 // Mark the loop as already vectorized to avoid vectorizing again. 9333 Hints.setAlreadyVectorized(); 9334 } 9335 9336 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9337 return true; 9338 } 9339 9340 LoopVectorizeResult LoopVectorizePass::runImpl( 9341 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 9342 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 9343 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 9344 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 9345 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 9346 SE = &SE_; 9347 LI = &LI_; 9348 TTI = &TTI_; 9349 DT = &DT_; 9350 BFI = &BFI_; 9351 TLI = TLI_; 9352 AA = &AA_; 9353 AC = &AC_; 9354 GetLAA = &GetLAA_; 9355 DB = &DB_; 9356 ORE = &ORE_; 9357 PSI = PSI_; 9358 9359 // Don't attempt if 9360 // 1. the target claims to have no vector registers, and 9361 // 2. interleaving won't help ILP. 9362 // 9363 // The second condition is necessary because, even if the target has no 9364 // vector registers, loop vectorization may still enable scalar 9365 // interleaving. 9366 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 9367 TTI->getMaxInterleaveFactor(1) < 2) 9368 return LoopVectorizeResult(false, false); 9369 9370 bool Changed = false, CFGChanged = false; 9371 9372 // The vectorizer requires loops to be in simplified form. 9373 // Since simplification may add new inner loops, it has to run before the 9374 // legality and profitability checks. This means running the loop vectorizer 9375 // will simplify all loops, regardless of whether anything end up being 9376 // vectorized. 9377 for (auto &L : *LI) 9378 Changed |= CFGChanged |= 9379 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9380 9381 // Build up a worklist of inner-loops to vectorize. This is necessary as 9382 // the act of vectorizing or partially unrolling a loop creates new loops 9383 // and can invalidate iterators across the loops. 
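  // (For instance, the new vector loop created while processing a loop is
  // itself added to LoopInfo, so iterating LoopInfo directly during
  // vectorization would walk a container that is being mutated.)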
9384 SmallVector<Loop *, 8> Worklist; 9385 9386 for (Loop *L : *LI) 9387 collectSupportedLoops(*L, LI, ORE, Worklist); 9388 9389 LoopsAnalyzed += Worklist.size(); 9390 9391 // Now walk the identified inner loops. 9392 while (!Worklist.empty()) { 9393 Loop *L = Worklist.pop_back_val(); 9394 9395 // For the inner loops we actually process, form LCSSA to simplify the 9396 // transform. 9397 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 9398 9399 Changed |= CFGChanged |= processLoop(L); 9400 } 9401 9402 // Process each loop nest in the function. 9403 return LoopVectorizeResult(Changed, CFGChanged); 9404 } 9405 9406 PreservedAnalyses LoopVectorizePass::run(Function &F, 9407 FunctionAnalysisManager &AM) { 9408 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 9409 auto &LI = AM.getResult<LoopAnalysis>(F); 9410 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 9411 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 9412 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 9413 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 9414 auto &AA = AM.getResult<AAManager>(F); 9415 auto &AC = AM.getResult<AssumptionAnalysis>(F); 9416 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 9417 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 9418 MemorySSA *MSSA = EnableMSSALoopDependency 9419 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 9420 : nullptr; 9421 9422 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 9423 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 9424 [&](Loop &L) -> const LoopAccessInfo & { 9425 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 9426 TLI, TTI, nullptr, MSSA}; 9427 return LAM.getResult<LoopAccessAnalysis>(L, AR); 9428 }; 9429 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 9430 ProfileSummaryInfo *PSI = 9431 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 9432 LoopVectorizeResult Result = 9433 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 9434 if (!Result.MadeAnyChange) 9435 return PreservedAnalyses::all(); 9436 PreservedAnalyses PA; 9437 9438 // We currently do not preserve loopinfo/dominator analyses with outer loop 9439 // vectorization. Until this is addressed, mark these analyses as preserved 9440 // only for non-VPlan-native path. 9441 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 9442 if (!EnableVPlanNativePath) { 9443 PA.preserve<LoopAnalysis>(); 9444 PA.preserve<DominatorTreeAnalysis>(); 9445 } 9446 PA.preserve<BasicAA>(); 9447 PA.preserve<GlobalsAA>(); 9448 if (!Result.MadeCFGChange) 9449 PA.preserveSet<CFGAnalyses>(); 9450 return PA; 9451 } 9452
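
// An illustrative way to exercise this pass in isolation (assuming the usual
// "loop-vectorize" registration under the new pass manager):
//   opt -passes=loop-vectorize -S input.ll -o vectorized.ll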